Return to Snippet

Revision: 50947
at September 9, 2011 19:52 by mail2yogs


Initial Code
#-------------------------------------------------------------------------------
# Name:        findduplicates
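# Purpose:     Report groups of files whose contents are identical once
#              '#' comment lines and blank lines are ignored.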
#
# Author:      ysharma
#
# Created:     09-09-2011
# Copyright:   (c) ysharma 2011
# Licence:     <your licence>
#-------------------------------------------------------------------------------
#!/usr/bin/env python
import glob
import re
import md5

def get_duplicates(filelist): ## {
    """Group files whose significant content is identical.

    Each file is reduced to an MD5 digest computed over its lines, skipping
    lines that are empty/whitespace-only or whose first non-whitespace
    character is '#'.  Files that share a digest are reported together.

    filelist -- iterable of paths to readable files.

    Returns a list of lists.  Each inner list holds two or more paths, in
    the order they appeared in filelist, whose digests match.  Files with
    no duplicate are omitted, so the result is empty when nothing matches.

    Errors raised while opening or reading a file propagate to the caller.
    """
    # hashlib supersedes the deprecated ``md5`` module (absent in Python 3);
    # imported locally so this function does not rely on the file header.
    import hashlib

    # Bytes patterns, because the files are read in binary mode below.
    exclude = [re.compile(pattern) for pattern in (br'^\s*#', br'^\s*$')]
    dup = {}
    for path in filelist: ## {
        digest = hashlib.md5()
        # Binary mode hands the hash raw bytes on every Python version, and
        # the ``with`` block closes the handle even if reading fails.
        with open(path, 'rb') as handle: ## {
            for line in handle: ## {
                if any(pattern.search(line) for pattern in exclude):
                    continue
                digest.update(line)
            ##}
        ##}
        dup.setdefault(digest.hexdigest(), []).append(path)
    ##}

    return [paths for paths in dup.values() if len(paths) > 1]
##}

def main(pattern="omx_proj/impl_1/jobs/job_*/*arun.tcl"): ## {
    """Print each group of duplicate files matched by a glob pattern.

    pattern -- glob expression selecting the files to compare; the default
               is the path that was previously hard-coded here.

    Prints a heading followed by one list of paths per duplicate group.
    Prints nothing when no duplicates are found.
    """
    duplicate_files = get_duplicates(glob.glob(pattern))
    if not duplicate_files:
        return

    # Single-argument print(...) emits the same text under Python 2's print
    # statement and Python 3's print function.
    print("Following files are duplicate:")
    for files in duplicate_files: ## {
        print(files)
    ##}
##}

# Run the duplicate report only when executed as a script, not on import.
if __name__ == '__main__':
    main()

Initial URL


Initial Description


Initial Title
Find duplicate files with regex exclude

Initial Tags
python

Initial Language
Python