1. 这个脚本很短,但是有几个比较 tricky 的地方:我的目录下有一个管道(FIFO),还有名字中带空格的文件和名字中带单引号(')的文件。把它们加入列表的时候没有问题,但是用 subprocess.Popen() 执行命令时就会报错,所以我对 cmd 做了转义处理。同时使用 stat.S_ISFIFO(os.stat(i).st_mode) 来剔除管道。
-
import os
-
import subprocess
-
import stat
-
-
def print_all_md5(dirname):
    """Print the MD5 checksum of every ``txt`` file found under *dirname*.

    The tree is walked recursively with ``os.walk``.  Named pipes (FIFOs)
    are left out, since reading one can block indefinitely.  For each
    remaining path whose name ends in ``txt`` the external ``md5sum``
    command is run and its stripped stdout and stderr are printed together
    on one line.

    Args:
        dirname: Directory to scan.

    Raises:
        FileNotFoundError: If the ``md5sum`` executable cannot be found.
    """
    paths = []
    for root, _dirs, files in os.walk(dirname):
        for name in files:
            path = os.path.join(root, name)
            try:
                is_fifo = stat.S_ISFIFO(os.stat(path).st_mode)
            except OSError:
                # Cannot stat (e.g. dangling symlink): keep the path so that
                # md5sum reports the problem on stderr instead of crashing here.
                is_fifo = False
            # Build a filtered list rather than removing from a list that is
            # being iterated, which silently skips the following element.
            if not is_fifo:
                paths.append(path)

    for path in paths:
        if not path.endswith('txt'):
            continue
        # An argv list (no shell) passes the name verbatim, so spaces, quotes
        # and any other shell metacharacters need no escaping.  "--" stops a
        # path that starts with "-" from being parsed as an option.
        proc = subprocess.Popen(
            ["md5sum", "--", path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()
        result = out.decode(errors="replace").strip()
        error = err.decode(errors="replace").strip()
        print(result, error)
-
-
print_all_md5(dirname=".")
稍微做一下修改,就可以找出目录下所有 MD5 摘要(digest)相同的文件。
-
import os
-
import subprocess
-
import stat
-
-
def print_all_md5(dirname, suffix):
    """Group the files under *dirname* ending in *suffix* by MD5 checksum.

    The tree is walked recursively with ``os.walk``.  Named pipes (FIFOs)
    are left out, since reading one can block indefinitely.  Each remaining
    matching file is hashed by running the external ``md5sum`` command.
    Files that ``md5sum`` cannot hash (non-zero exit status or no output)
    are left out of the result.

    Args:
        dirname: Directory to scan.
        suffix: Only paths ending with this string are hashed.

    Returns:
        A dict mapping each hex checksum to the list of file paths (as
        produced by ``os.path.join(root, name)``) that have that checksum.

    Raises:
        FileNotFoundError: If the ``md5sum`` executable cannot be found.
    """
    paths = []
    for root, _dirs, files in os.walk(dirname):
        for name in files:
            path = os.path.join(root, name)
            try:
                is_fifo = stat.S_ISFIFO(os.stat(path).st_mode)
            except OSError:
                # Cannot stat (e.g. dangling symlink): md5sum will fail on it
                # below and the file is then skipped.
                is_fifo = False
            # Build a filtered list rather than removing from a list that is
            # being iterated, which silently skips the following element.
            if not is_fifo:
                paths.append(path)

    by_checksum = {}
    for path in paths:
        if not path.endswith(suffix):
            continue
        # An argv list (no shell) passes the name verbatim, so no escaping is
        # needed and the real, unescaped path is what ends up in the result.
        proc = subprocess.Popen(
            ["md5sum", "--", path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, _err = proc.communicate()
        fields = out.decode(errors="replace").split()
        if proc.returncode != 0 or not fields:
            continue
        # GNU md5sum prefixes the line with a backslash when it had to escape
        # characters in the file name; that prefix is not part of the digest.
        checksum = fields[0].lstrip("\\")
        by_checksum.setdefault(checksum, []).append(path)
    return by_checksum
-
-
-
def print_duplicates(d):
    """Print each group of names in *d* that holds more than one entry.

    Args:
        d: Mapping whose values are sequences of names; the keys are not
            used.  Every value with more than one element is printed as a
            header line followed by one name per line.
    """
    for group in d.values():
        if len(group) <= 1:
            continue
        print('The following files have the same checksum')
        print(*group, sep='\n')
-
-
-
def find_dup_file():
    """Report the ``.txt`` files under the current directory that share a checksum.

    Collects the checksum table from ``print_all_md5`` for ``"."`` and hands
    it to ``print_duplicates`` for printing.
    """
    print_duplicates(print_all_md5(suffix=".txt", dirname="."))
-
find_dup_file()
-
2. 回头我准备用pathlib重写下path 那一段,pathlib 提供了更多查询文件类型的功能。
-
import pathlib
-
def my_walk(dirname):
    """Return every path below *dirname* that is neither a FIFO nor a socket.

    Args:
        dirname: Directory to search, as a string or path-like object.

    Returns:
        A list of ``pathlib.Path`` objects matched recursively by the
        ``**/*`` pattern (this includes sub-directories as well as files).
        The list is empty when ``'__pycache__'`` occurs anywhere in
        *dirname*.
    """
    # The original returned an undefined name here; an empty list keeps the
    # return type uniform for callers that iterate over the result.
    if '__pycache__' in str(dirname):
        return []

    # Only the ``pathlib`` module is imported, so the class must be qualified.
    base = pathlib.Path(dirname)
    return [
        entry
        for entry in base.glob('**/*')
        if not entry.is_fifo() and not entry.is_socket()
    ]
3. 感谢另一位热心的网友:使用 hashlib 比我调用(invoke)外部 md5sum 命令的代码要好。
-
def print_md5(dirname):
    """Print the MD5 digest of every regular ``*.txt`` file directly in *dirname*.

    For each file a separator line is printed, followed by a line holding
    the hex digest and the file's base name.  Files are processed in sorted
    order.  Entries matching ``*.txt`` that are not regular files (for
    example directories or named pipes) are skipped.

    Args:
        dirname: Directory whose ``*.txt`` files are hashed (not recursive).
    """
    # Imported locally because the surrounding file does not import them.
    import glob
    import hashlib
    import os

    # Search inside *dirname* (the original always searched the current
    # directory); escape it so glob metacharacters in the directory name are
    # taken literally.
    pattern = os.path.join(glob.escape(os.fspath(dirname)), "*.txt")
    for path in sorted(glob.glob(pattern)):
        if not os.path.isfile(path):
            continue
        print ("=================")
        digest = hashlib.md5()
        # ``with`` closes the file even if reading fails; hashing in chunks
        # avoids loading a large file into memory at once.
        with open(path, 'rb') as md5file:
            for chunk in iter(lambda: md5file.read(65536), b""):
                digest.update(chunk)
        print (digest.hexdigest(), os.path.basename(path))
-
-
print_md5(dirname=".")
阅读(1697) | 评论(0) | 转发(0) |