Find duplicate files with regex exclude


Published in: Python
Save to your folder(s)



Copy this code and paste it in your HTML
  1. #-------------------------------------------------------------------------------
  2. # Name: findduplicates
  3. # Purpose:
  4. #
  5. # Author: ysharma
  6. #
  7. # Created: 09-09-2011
  8. # Copyright: (c) ysharma 2011
  9. # Licence: <your licence>
  10. #-------------------------------------------------------------------------------
  11. #!/usr/bin/env python
  12. import glob
  13. import re
  14. import md5
  15.  
  16. def get_duplicates(filelist): ## {
  17. exclude = [ re.compile(pattern) for pattern in ['^\s*#','^\s*$']]
  18. dup={}
  19. for file in filelist: ## {
  20. m=md5.new()
  21. for line in open(file).readlines(): ## {
  22. skip=0
  23. for pattern in exclude:
  24. if pattern.search(line):
  25. skip=1
  26. if skip:
  27. continue
  28.  
  29. m.update(line)
  30. ##}
  31. filehash=m.hexdigest()
  32. dup.setdefault(filehash,[]).append(file)
  33.  
  34. ##}
  35.  
  36. return [paths for paths in dup.values() if len(paths) > 1]
  37. ##}
  38.  
  39. def main(): ## {
  40. duplicate_files = get_duplicates(glob.glob("omx_proj/impl_1/jobs/job_*/*arun.tcl"))
  41. if len(duplicate_files): ## {
  42. print "Following files are duplicate:"
  43. for files in duplicate_files: ## {
  44. print files
  45. ##}
  46. ##}
  47. ##}
  48.  
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()

Report this snippet


Comments

Subscribe to comments (RSS)

You need to login to post a comment.