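# 实现存储目录内重复文件以及重复文件个数的统计,输出统计文件 (Count the duplicate files, and how many copies of each exist, under a storage directory, and write the statistics to a file.)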
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os
import datetime
import platform
import sys
import hashlib
def CalcFileSha256(filname, bufsize=1024 * 1024 * 16):
    """Return the SHA-256 digest of a file as a hexadecimal string.

    The file is opened in binary mode and hashed chunk by chunk, so the
    whole file is never held in memory at once.

    :param filname: path of the file to hash.
    :param bufsize: number of bytes read per chunk; defaults to 16 MiB.
    :return: the hexadecimal SHA-256 digest of the file contents.
    :raises ValueError: if bufsize is not positive (a zero-sized read would
        look like end-of-file and silently hash nothing).
    :raises OSError: if the file cannot be opened or read.
    """
    if bufsize <= 0:
        raise ValueError("bufsize must be a positive integer")
    sha256obj = hashlib.sha256()
    with open(filname, "rb") as f:
        # iter(callable, sentinel) keeps calling read() until it returns
        # b"", which is how a binary file signals end-of-file.
        for data in iter(lambda: f.read(bufsize), b""):
            sha256obj.update(data)
    return sha256obj.hexdigest()
class CoSailTextFile():
    """Small text-file wrapper whose open/write/close report errors by return value.

    Failures in open(), writeLine(), writeLines(), readLine() and close()
    are printed and signalled through the return value instead of being
    raised.  readAll() and readLines() do not catch anything and raise if
    the file is not open.

    The object can also be used as a context manager; leaving the ``with``
    block closes the file.  The caller still has to call open() explicitly.
    """

    def __init__(self, filename=None):
        """Remember the file name; the file itself is not opened yet.

        :param filename: path of the file that open() will open.
        """
        self.filename = filename
        # Underlying file object; None whenever the wrapper is closed.
        self.f = None

    def __enter__(self):
        """Return the wrapper itself; the file is not opened implicitly."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving a ``with`` block; never suppress exceptions."""
        self.close()
        return False

    def open(self, m):
        """Open the file with mode m.

        A handle left open by a previous open() call is closed first so it
        is not leaked.

        :param m: mode string passed to the built-in open().
        :return: True on success, False if opening failed (the error is printed).
        """
        if self.f is not None:
            self.close()
        try:
            self.f = open(self.filename, m)
        except Exception as e:
            print(e)
            self.f = None
            return False
        return True

    def writeLine(self, line):
        """Write line followed by a newline.

        :return: True on success, False on any failure, including the file
            not being open (the error is printed).
        """
        try:
            self.f.write(line + "\n")
            return True
        except Exception as e:
            # Deliberately broad: a closed wrapper (self.f is None) must also
            # be reported as False rather than raise.
            print(e)
            return False

    def writeLines(self, lines):
        """Write every item of lines via writeLine, stopping at the first failure.

        :return: True if all lines were written, False otherwise.
        """
        for line in lines:
            if not self.writeLine(line):
                return False
        return True

    def readLine(self):
        """Read one line.

        :return: the text read, or None if reading failed (the error is printed).
        """
        try:
            return self.f.readline()
        except Exception as e:
            print(e)
            return None

    def readAll(self):
        """Return the remaining file content as one string."""
        return self.f.read()

    def readLines(self):
        """Return the remaining file content as a list of lines."""
        return self.f.readlines()

    def close(self):
        """Close the file if it is open.

        :return: True if the file is closed afterwards (or was never open),
            False if closing failed (the error is printed).
        """
        if self.f is None:
            return True
        try:
            self.f.close()
            self.f = None
        except Exception as e:
            print(e)
            return False
        return True

    def isOpen(self):
        """Return True while a file object is held."""
        return self.f is not None
def getFilePathExtend(fileName):
    """Split a path into its directory, extension and base name.

    :param fileName: the path to split.
    :return: a tuple (directory part, extension without the leading dot,
        final path component).
    """
    extension = os.path.splitext(fileName)[1]
    directory = os.path.dirname(fileName)
    base_name = os.path.basename(fileName)
    # splitext keeps the leading dot on the extension; drop it.
    return directory, extension[1:], base_name
# Shared scan result: maps a file name to the list of attribute dicts
# ("filepath", "size", "sha256") of every file found with that name.
filesdict = {}
def scansamenamefile(path, _reset=True):
    """Recursively scan path and group the files found by file name.

    Each top-level call empties the shared filesdict before scanning, so the
    result describes only the tree under path and not files left over from
    an earlier scan.  Directories that are symbolic links are not followed.
    A file whose size or hash cannot be read (for example a dangling
    symbolic link) is reported with print() and skipped.

    :param path: directory to scan.
    :param _reset: internal flag; the recursive calls pass False so that
        sub-directories add to the result of the scan in progress.
    :return: the module-level filesdict, or None if path does not exist or
        cannot be listed.
    """
    if not os.path.exists(path):
        print("Store Path :" + path +" isn't exist!")
        return None
    try:
        files = os.listdir(path)
    except FileNotFoundError:
        print("File Not FoundError")
        return None
    except OSError as e:
        # e.g. path is a regular file, or permission is denied
        print(e)
        return None
    if _reset:
        # Clear in place so existing references to filesdict stay valid.
        filesdict.clear()
    for fi in files:
        fi_d = os.path.join(path, fi)
        if os.path.isdir(fi_d):
            if os.path.islink(fi_d):
                continue
            # A failure inside a sub-directory must not abort the scan,
            # so the recursive call's return value is ignored.
            scansamenamefile(fi_d, _reset=False)
            continue
        file_name = os.path.basename(fi_d)
        tmpfullpath = os.path.dirname(fi_d) +"/" + file_name
        try:
            fileattrdict = {
                "filepath": tmpfullpath,
                "size": os.path.getsize(tmpfullpath),
                "sha256": CalcFileSha256(tmpfullpath),
            }
        except OSError as e:
            print(e)
            continue
        # setdefault creates the per-name list on first sight of a name.
        filesdict.setdefault(file_name, []).append(fileattrdict)
    return filesdict
def statallfiles(d):
    """Return the total number of file records held in d.

    :param d: mapping of file name to the list of records for that name.
    :return: the sum of the lengths of all the lists.
    """
    return sum(len(records) for records in d.values())
def statsamenamefiles(d):
    """Return how many file names in d occur at least twice.

    :param d: mapping of file name to the list of records for that name.
    :return: the number of names whose list holds two or more records.
    """
    return sum(1 for records in d.values() if len(records) >= 2)
def outduplicatefilemsg(d, textFile):
    """Write the section listing every file name that occurs at least twice.

    For each such name a blank line, a summary line and one line per file
    (path, size, SHA-256) are written.

    :param d: mapping of file name to a list of dicts with the keys
        "filepath", "size" and "sha256".
    :param textFile: object providing writeLine(str).
    """
    textFile.writeLine("")
    textFile.writeLine("**************Duplicate file list*****************")
    for name, records in d.items():
        if len(records) < 2:
            continue
        textFile.writeLine("")
        textFile.writeLine("file name : " + name +" Duplicate : " + str(len(records)))
        # No separate "file list :" header line is written; the report
        # format consists of the summary line followed by the entries.
        for obj in records:
            textFile.writeLine("fileputh :" + obj["filepath"] +" size : " + str(obj["size"]) +" SHA256 = " + obj["sha256"])
def outonefilemsg(d, textFile):
    """Write the section listing every file whose name occurs exactly once.

    :param d: mapping of file name to a list of dicts with the keys
        "filepath" and "size".
    :param textFile: object providing writeLine(str).
    """
    textFile.writeLine("")
    textFile.writeLine("**************Unique file list*****************")
    for records in d.values():
        if len(records) != 1:
            continue
        obj = records[0]
        textFile.writeLine("fileputh :" + obj["filepath"] +" size : " + str(obj["size"]))
def statpathfileattr(pathlistfilename, logpath):
    """Run the duplicate-file statistics for every directory named in a list file.

    :param pathlistfilename: text file with one directory path per line;
        blank lines are ignored.
    :param logpath: directory that receives the log files.  When None, the
        directory containing pathlistfilename is used (the current
        directory if pathlistfilename has no directory part).
    :return: None.  A missing list file, a missing log directory or an
        unreadable list file is reported with print() and nothing is done.
    """
    if logpath is None:
        # A bare file name has an empty directory part, which would fail
        # the existence check below; use the current directory instead.
        logpath = os.path.dirname(pathlistfilename) or "."
    if not os.path.exists(pathlistfilename):
        print("path list filename isn't exist!")
        return
    if not os.path.exists(logpath):
        print("log path isn't exist")
        return
    try:
        # The with-block guarantees the list file is closed again.
        with open(pathlistfilename, "r") as f:
            lines = f.readlines()
    except OSError as e:
        print(e)
        return
    for line in lines:
        line = line.rstrip("\r\n")
        if not line.strip():
            continue
        statpathfileattrbypath(line, logpath)
def statpathfileattrbypath(path, logpath):
    """Collect file statistics for one directory and write them to a log file.

    The totals are printed, then a log file is created in logpath.  Its name
    is derived from path (separators become "-", colons are removed)
    followed by the current timestamp and ".log".

    :param path: directory to analyse.
    :param logpath: directory in which the log file is created.
    :return: None.  Nothing is written when the scan of path fails or the
        log file cannot be opened.
    """
    d = scansamenamefile(path)
    if d is None:
        return
    allfiles = statallfiles(d)
    samenamefiles = statsamenamefiles(d)
    print("allfiles", allfiles)
    print("sameallfiles", samenamefiles)
    t = path.replace("/", "-").replace("\\", "-").replace(":", "")
    # On Linux an absolute path yields a leading "-"; drop only that dash so
    # a relative path does not lose its first character.
    if platform.system() == 'Linux' and t.startswith("-"):
        t = t[1:]
    lonfilename = logpath +"/" + t + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') +".log"
    lonfilename = lonfilename.replace("//", "/")
    f = CoSailTextFile(lonfilename)
    if not f.open("w"):
        # open() has already printed the reason.
        return
    try:
        f.writeLine("stat path : " + path)
        f.writeLine("all files : " + str(allfiles))
        f.writeLine("Duplicate files : " + str(samenamefiles))
        outduplicatefilemsg(d, f)
        outonefilemsg(d, f)
    finally:
        f.close()
# Script entry point.  Command line: <path list file> [<log directory>].
if __name__ =='__main__':
    # Empty the shared result dict, then rebind the module-level name to a
    # new empty dict.
    filesdict.clear()
    filesdict={}
    argv = sys.argv
    print(argv)
    if (len(argv) ==3):
        # Two arguments: path list file and log directory.
        statpathfileattr(argv[1], argv[2])
    elif (len(argv) ==2):
        # One argument: path list file only; None is passed as log directory.
        statpathfileattr(argv[1],None)
    else:
        # Any other argument count: run against hard-coded paths.
        # NOTE(review): the statements below are assumed to belong to this
        # fallback branch; confirm the intended nesting.
        statpathfileattr("E:/1.txt","E:/")
        # Hash two hard-coded files, print the type of a digest, report
        # whether the two digests are equal, then print both digests.
        print(type(CalcFileSha256("E:/gsl2.4.zip")))
        sha2561 =CalcFileSha256("E:/gsl2.4.zip")
        sha2562 = CalcFileSha256("E:/1.txt")
        if (sha2561 == sha2562):
            print("true")
        print(sha2562)
        print(sha2561)
#d = buildfilenamedictformfilelist("/home/hadoop/tmp/dir-and-files.list")
# The bare string below is a no-op expression statement; translated, it reads
# "root directory of the storage destination path".
"""存储目的路径根目录"""
# The triple-quoted string below is also a no-op expression statement; it
# holds disabled experiment code that scans a directory and writes a report.
"""
destrootpath = "/home/hadoop/tmp4"
path = "E:/data"
d = scansamenamefile(path)
print(d)
for k in d:
print("key =", k)
print("files = ", len(d[k]))
print("allfiles", statallfiles(d))
print("sameallfiles", statsamenamefiles(d))
f = CoSailTextFile("e:/sss.log")
f.open("w")
f.writeLine("stat path : " + path)
f.writeLine("all files : " + str(statallfiles(d)))
f.writeLine("Duplicate files : " +str(statsamenamefiles(d)))
outduplicatefilemsg(d, f)
outonefilemsg(d, f)
f.close()
f = CoSailTextFile("E:/1.txt")
f.open("r")
text = f.readLines()
print((text))
"""