最近发现一个很好的网站:ebook.elain.org,里面有大量的技术书籍。于是想使用 wget 把整个网站都下载下来。但是 wget 对中文的 URL 支持得不够好,直接使用:
wget -m
下载的话,中文文件名就会乱码,比如“2010架构师大会PPT”就变成了“2010鏋舵瀯甯堝ぇ浼歅PT”。而如果改用:
wget --restrict-file-name=ascii -m
下载的话,中文文件名会编码成 URL 形式,比如“2010架构师大会PPT”就变成了“2010%E6%9E%B6%E6%9E%84%E5%B8%88%E5%A4%A7%E4%BC%9APPT”。主要是因为在网页上,中文 URL 会以 UTF-8 来编码,而 Windows 存储文件名是用 GBK 编码。也就是说“2010鏋舵瀯甯堝ぇ浼歅PT”实际上是以 GBK 编码来显示的 UTF-8 编码的文件名。这样我们只要用 Python 写个编码转换器就可以了。代码如下:
- import os, urllib, sys, getopt
- class Renamer:
- input_encoding = ""
- output_encoding = ""
- path = ""
- is_url = False
- def __init__(self, input, output, path, is_url):
- self.input_encoding = input
- self.output_encoding = output
- self.path = path
- self.is_url = is_url
- def start(self):
- self.rename_dir(self.path)
- def rename(self, root, path):
- try:
- if self.is_url:
- new = urllib.unquote(path).decode(self.input_encoding).encode(self.output_encoding)
- else:
- new = path.decode(self.input_encoding).encode(self.output_encoding)
- os.rename(os.path.join(root, path), os.path.join(root, new))
- except:
- pass
- def rename_dir(self, path):
- for root, dirs, files in os.walk(path):
- for f in files:
- self.rename(root, f)
- if dirs == []:
- for f in files:
- self.rename(root, f)
- else:
- for d in dirs:
- self.rename_dir(os.path.join(root, d))
- self.rename(root, d)
- def usage():
- print ''
-
-
-
-
-
-
-
-
- def main(argv):
- input_encoding = "utf-8"
- output_encoding = "gbk"
- path = ""
- is_url = True
- try:
- opts, args = getopt.getopt(argv, "hi:o:p:u", ["help", "input-encoding=", "output-encoding=", "path=", "is-url"])
- except getopt.GetoptError:
- usage()
- sys.exit(2)
- for opt, arg in opts:
- if opt in ("-h", "--help"):
- usage()
- sys.exit()
- elif opt in ("-i", "--input-encoding"):
- input_encoding = arg
- elif opt in ("-o", "--output-encoding"):
- output_encoding = arg
- elif opt in ("-p", "--path"):
- path = arg
- elif opt in ("-u", "--is-url"):
- is_url = True
- rn = Renamer(input_encoding, output_encoding, path, is_url)
- rn.start()
- if __name__ == '__main__':
- main(sys.argv[1:])
import os, urllib, sys, getopt
class Renamer:
    """Repair file and directory names that are shown in the wrong encoding.

    Every entry below ``path`` is renamed in place.  A name is optionally
    percent-decoded first (``is_url``) and then re-interpreted from
    ``input_encoding`` (the encoding the name was really written in) to
    ``output_encoding`` (the encoding the names are currently read with).

    The conversion works on both byte-string names (Python 2 ``str``, or a
    ``bytes`` path on Python 3) and text names (Python 3 ``str``).
    """

    # Class-level defaults; __init__ overwrites all of them per instance.
    input_encoding = ""
    output_encoding = ""
    path = ""
    is_url = False

    def __init__(self, input, output, path, is_url):
        """Store the conversion settings.

        Args:
            input: Name of the encoding the file names were written in.
            output: Name of the encoding the file names are displayed in.
            path: Root directory whose content is renamed.
            is_url: When true, names are percent-decoded before conversion.
        """
        self.input_encoding = input
        self.output_encoding = output
        self.path = path
        self.is_url = is_url

    def start(self):
        """Rename everything below ``self.path``."""
        self.rename_dir(self.path)

    def _skip(self, name, reason):
        """Report on stderr that ``name`` was left untouched."""
        sys.stderr.write("skipped %r: %s\n" % (name, reason))

    def _convert(self, name):
        """Return the repaired form of ``name`` (same type as ``name``).

        Raises:
            UnicodeError: The name cannot be converted with the configured
                encodings.
            LookupError: One of the configured encodings is unknown.
        """
        src = self.input_encoding
        dst = self.output_encoding
        if self.is_url:
            # Imported locally so the class runs on Python 2 and Python 3.
            try:
                from urllib.parse import unquote_to_bytes
            except ImportError:  # Python 2: unquote() maps bytes to bytes
                from urllib import unquote as unquote_to_bytes
        if isinstance(name, bytes):
            raw = unquote_to_bytes(name) if self.is_url else name
            return raw.decode(src).encode(dst)
        if self.is_url:
            # Percent escapes carry bytes in the input encoding, so the
            # whole name is unquoted and decoded in that encoding.
            return unquote_to_bytes(name.encode(src)).decode(src)
        # A text name is input-encoded bytes that were decoded with the
        # output encoding; undo that wrong decoding.
        return name.encode(dst).decode(src)

    def rename(self, root, path):
        """Rename the entry ``path`` inside directory ``root``.

        Entries that cannot be converted, that would leave ``root``, or
        whose new name already exists are reported on stderr and skipped.

        Returns:
            True when the entry was renamed, False otherwise.
        """
        try:
            new = self._convert(path)
        except UnicodeError as err:
            self._skip(path, err)
            return False
        if new == path:
            return False
        # A decoded "%2F" or "%5C" must never move the entry elsewhere.
        if os.path.basename(new) != new:
            self._skip(path, "converted name contains a path separator")
            return False
        target = os.path.join(root, new)
        # os.rename() may silently replace an existing entry on POSIX.
        if os.path.lexists(target):
            self._skip(path, "target name already exists")
            return False
        try:
            os.rename(os.path.join(root, path), target)
        except OSError as err:
            self._skip(path, err)
            return False
        return True

    def rename_dir(self, path):
        """Rename all files and directories below ``path`` (not ``path``)."""
        # Bottom-up: a directory is renamed only after its whole content
        # has been handled, so no path still to be visited becomes stale
        # and every entry is converted exactly once.
        for root, dirs, files in os.walk(path, topdown=False):
            for entry in files:
                self.rename(root, entry)
            for entry in dirs:
                self.rename(root, entry)
def usage():
    """Write the command-line help text to standard output."""
    help_text = '''This program can change encode of files or directories.
Usage: rename.exe [OPTION]...
Options:
-h, --help this document.
-i, --input-encoding=ENC set original encoding, default is UTF-8.
-o, --output-encoding=ENC set output encoding, default is GBK.
-p, --path=PATH choose the path which to process.
-u, --is-url whether as a URL
'''
    # sys.stdout.write works on Python 2 and 3 alike; the extra newline is
    # the one the former ``print`` statement appended.
    sys.stdout.write(help_text + "\n")
def main(argv):
    """Parse the command line and run the renamer.

    Args:
        argv: Command-line arguments without the program name.

    Exits with status 2 after printing the usage text when an option is
    invalid or no path is given, and with status 0 after ``-h/--help``.
    """
    input_encoding = "utf-8"
    output_encoding = "gbk"
    path = ""
    # Off by default: with a default of True the -u/--is-url switch had no
    # effect and URL decoding could never be turned off.
    is_url = False
    try:
        opts, _args = getopt.getopt(
            argv,
            "hi:o:p:u",
            ["help", "input-encoding=", "output-encoding=", "path=", "is-url"],
        )
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-i", "--input-encoding"):
            input_encoding = arg
        elif opt in ("-o", "--output-encoding"):
            output_encoding = arg
        elif opt in ("-p", "--path"):
            path = arg
        elif opt in ("-u", "--is-url"):
            is_url = True
    if not path:
        # Walking an empty path visits nothing; tell the user instead of
        # silently doing no work.
        usage()
        sys.exit(2)
    rn = Renamer(input_encoding, output_encoding, path, is_url)
    rn.start()
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    cli_args = sys.argv[1:]  # everything after the program name
    main(cli_args)
如果 wget 是使用以下命令行来下载:
wget --restrict-file-name=ascii -m
那么下载下来的文件是“2010%E6%9E%B6%E6%9E%84%E5%B8%88%E5%A4%A7%E4%BC%9APPT”形式,运行脚本时就使用以下命令:
rename.py -i utf-8 -o gbk -p R:\ebook.elain.org -u
下载后的结果如下图。
转:http://blog.csdn.net/kowity/article/details/6899256