#!/usr/bin/env python
# compare_dirs_by_content.py
# Bernd Ragutt
#  Purpose:
#     The script compares the text files of two directories, files with a given extension,
#     ignoring differences in white space the following way: 
#     For byte by byte comparison sequences of white space characters are condensed 
#     to just one empty space character.
#
#  Usage:
#     compare_dirs_by_content.py {Extension} NameOfDirectory1 NameOfDirectory2
#
#  Default extension:
#     .ada
#
#  Note:
#      The extension begins with a dot '.'
#      Directory names may be relative or absolute.
#      A Separator at the end of a name is optional.
#      Names containing empty spaces must be enclosed by quotation marks '"'.
#
#  Example (Unix):
#     compare_dirs_by_content.py .txt /home/ra/temp /home/ra/temp2/
#
#  Examples (Windows):
#     compare_dirs_by_content.py F:\Projekte\Python\differ "F:\Projekte\Python\differ temp"
#     F:\Projekte\Software\Python\differ\compare_dirs_by_content.py differ "differ temp"
#
#  For different files
#     the first different non white character is marked with a '^'. Its position
#     in the files is named 'At' in the output. 'L1-L2' is the difference of
#     lengths of files. These numbers are numbers for internal files with all
#     substrings "[ \t\n\r\f\v]+" replaced by just one empty space character.
#
#  Example of output:
#
##   # Comparing contents of files - ignoring white space ...
##   
##   > Directory 1: F:\Projekte\Python\differ\
##   > Directory 2: F:\Projekte\Python\differ temp\
##   > Extension  : .ada
##   
##   - Files NOT in directory 2:
##       Number Of files: 2
##   
##     * dummy.ada
##     * temp 1.ada
##   
##   - Files NOT in directory 1:
##       Number Of files: 2
##   
##     * dummy_temp.ada
##     * temp.ada
##   
##   + Equal files - apart from white space:
##       Number Of files: 1
##   
##     * cs1.ada
##   
##   ! Different files: - apart from white space
##       Number Of files: 1
##   
##     * cs 2.ada
##                F1: "True, ZaX => True, Sap => True, Tan => True,"
##                F2: "True, Zan => True, Sap => True, Tan => True,"
##                             ^   
##                At:  2281
##             L1-L2:  9
##   
##   
##   # Compared.
#
#  Note: Names containing empty spaces are not enclosed by quotation marks.

import os;

if __name__!="__main__":
    import PySide2.QtCore;

def getData(FileName):
    """Return the text of a file with all white space runs condensed.

    The file is opened in text mode and read completely. Leading and
    trailing white space is stripped; afterwards every run of space, tab,
    newline, carriage return, form feed and vertical tab characters is
    replaced by exactly one space character.

    Parameters:
        FileName: Path of the text file to read.

    Returns:
        The condensed text as a string (empty for an empty or
        white-space-only file).

    Raises:
        IOError: The file cannot be opened, read or decoded. A message
            naming the file is printed before the exception is raised.
    """
    try:
        # The context manager closes the file even when read() fails.
        with open(FileName,'r') as fRawData:
            RawData=fRawData.read().strip()
    except (IOError,OSError,ValueError):
        # ValueError includes UnicodeDecodeError raised for non-text content.
        print('\n! File ERROR - Cannot open/read file: '+FileName)
        raise IOError('Cannot open/read file: '+FileName)
    # end try

    # Turn every white space character into a blank first ...
    for c in '\t\n\r\f\v':
        RawData=RawData.replace(c,' ')
    # end for

    # ... then drop the empty pieces produced by consecutive blanks. The text
    # is already stripped, so no blank can be lost at either end.
    return ' '.join(Piece for Piece in RawData.split(' ') if Piece)

# end getData()


def compareFiles(NameOfFile1,NameOfFile2,MatchCase,NoSpaceAtAll):
    """Compare two text files, ignoring differences in white space.

    Both files are read with getData(), i.e. with white space runs
    condensed to one space character.

    Parameters:
        NameOfFile1: Path of the first file.
        NameOfFile2: Path of the second file.
        MatchCase: If false, both texts are lower-cased before comparing
            (the returned snippets are then lower-cased as well).
        NoSpaceAtAll: If true, the remaining space characters are removed
            before comparing.

    Returns:
        [True] if the processed texts are equal. Otherwise a list
        [False, LengthDiff, Position, Snippet1, Snippet2] where
        LengthDiff is len(text1)-len(text2), Position is the 1-based
        index of the first differing character, and the snippets are the
        parts of both texts from up to 8 characters before that position
        to up to 35 characters after it.

    Raises:
        Whatever getData() raises when a file cannot be read.
    """
    PurgedData1=getData(NameOfFile1)
    PurgedData2=getData(NameOfFile2)

    if NoSpaceAtAll:
        PurgedData1=PurgedData1.replace(' ','')
        PurgedData2=PurgedData2.replace(' ','')
    # end if

    if not MatchCase:
        PurgedData1=PurgedData1.lower()
        PurgedData2=PurgedData2.lower()
    # end if

    if PurgedData1==PurgedData2:
        return [True]
    # end if

    len1=len(PurgedData1)
    len2=len(PurgedData2)
    Length_Diff_By=len1-len2

    # If one text is a prefix of the other, the loop below finds no mismatch;
    # the first difference is then the character just behind the shorter text.
    FirstDifferentCharNb=min(len1,len2)+1

    for I,(c1,c2) in enumerate(zip(PurgedData1,PurgedData2)):
        if c1!=c2:
            FirstDifferentCharNb=I+1
            break
    # end for

    # Slicing clamps the upper bound to the text length by itself, so the
    # last character of a text is no longer cut off from the snippet.
    min_nb=max(FirstDifferentCharNb-9,0)
    max_nb=FirstDifferentCharNb+35

    return [False,
            Length_Diff_By,
            FirstDifferentCharNb,
            PurgedData1[min_nb:max_nb],
            PurgedData2[min_nb:max_nb]]

# end compareFiles()


def getListOfFiles(NameOfDir,Extension):
    """Return the sorted names of the files in a directory with an extension.

    Only regular files are returned; a sub-directory whose name happens to
    end with the extension is skipped, because it could not be read as a
    text file later on.

    Parameters:
        NameOfDir: Path of the directory to list (with or without a
            trailing separator).
        Extension: Suffix the file names must end with, e.g. '.ada'.

    Returns:
        A sorted list of file names (without directory part).

    Raises:
        IOError: The directory cannot be listed. A message naming the
            directory is printed before the exception is raised.
    """
    try:
        Entries=os.listdir(NameOfDir)
    except (IOError,OSError,ValueError):
        print('\n! Directory ERROR - Cannot access: '+NameOfDir)
        raise IOError('Cannot access directory: '+NameOfDir)
    # end try

    return sorted(f for f in Entries
                  if f.endswith(Extension)
                  and os.path.isfile(os.path.join(NameOfDir,f)))

# end getListOfFiles()


def compareDirs(NameOfDir1,NameOfDir2,Extension,MatchCase,NoSpaceAtAll):
    """Compare the files of two directories and return a report.

    The files with the given extension are listed with getListOfFiles().
    Files present in both directories are compared with compareFiles().

    Parameters:
        NameOfDir1: Path of the first directory.
        NameOfDir2: Path of the second directory.
        Extension: Suffix of the file names to consider, e.g. '.ada'.
        MatchCase: Passed on to compareFiles().
        NoSpaceAtAll: Passed on to compareFiles().

    Returns:
        A list of output lines (strings): the files missing in either
        directory, the equal files, and the different files together with
        the snippets around the first difference, a '^' marker below it,
        its position and the difference of the text lengths.

    Raises:
        Whatever getListOfFiles() or compareFiles() raise.
    """
    ListOfFiles1=getListOfFiles(NameOfDir1,Extension)
    ListOfFiles2=getListOfFiles(NameOfDir2,Extension)

    # Sets give constant-time membership tests; the sorted order of the
    # lists is kept by filtering the lists themselves.
    Names1=set(ListOfFiles1)
    Names2=set(ListOfFiles2)

    ListOfFiles1_NotIn2=[f for f in ListOfFiles1 if f not in Names2]
    ListOfFiles2_NotIn1=[f for f in ListOfFiles2 if f not in Names1]
    CommonFiles=[f for f in ListOfFiles1 if f in Names2]

    EqualFiles=[]
    DifferentFiles=[]

    for f in CommonFiles:
        # os.path.join works with and without a trailing separator at the
        # end of the directory names.
        Result=compareFiles(os.path.join(NameOfDir1,f),
                            os.path.join(NameOfDir2,f),
                            MatchCase,NoSpaceAtAll)

        # Only when imported as a module: let Qt process its pending events
        # after each file (presumably to keep a GUI responsive - confirm).
        if __name__!="__main__":
            PySide2.QtCore.QCoreApplication.processEvents()

        if Result[0]:
            EqualFiles.append(f)
        else:
            DifferentFiles.append([f,Result[1],Result[2],Result[3],Result[4]])
        # end if
    # end for

    Outputs=[]

    Outputs.append('\n- Files NOT in directory 2:')
    Outputs.append('  Number Of files: '+str(len(ListOfFiles1_NotIn2)))
    Outputs.extend('  * '+f for f in ListOfFiles1_NotIn2)

    Outputs.append('\n- Files NOT in directory 1:')
    Outputs.append('  Number Of files: '+str(len(ListOfFiles2_NotIn1)))
    Outputs.extend('  * '+f for f in ListOfFiles2_NotIn1)

    Outputs.append('\n+ Equal files - apart from white space:')
    Outputs.append('  Number Of files: '+str(len(EqualFiles)))
    Outputs.extend('  * '+f for f in EqualFiles)

    Outputs.append('\n! Different files: - apart from white space')
    Outputs.append('  Number Of files: '+str(len(DifferentFiles)))

    for Name,LengthDiff,FirstDifferentCharNb,Snippet1,Snippet2 in DifferentFiles:
        # The snippets start 18 columns in and show at most 8 characters
        # before the difference, so the marker needs 17 blanks plus the
        # position, limited to 26 blanks.
        Blanks=' '*(17+min(FirstDifferentCharNb,9))

        Outputs.append('  * '+Name)
        Outputs.append('             F1: "'+Snippet1)
        Outputs.append('             F2: "'+Snippet2)
        Outputs.append(Blanks+'^')
        Outputs.append('             At:  '+str(FirstDifferentCharNb))
        Outputs.append('          L1-L2:  '+str(LengthDiff))
    # end for

    return Outputs

# end compareDirs()

# ------------------------------------------------------------------------------#
# ------------------------------------------------------------------------------#

# Command line entry point:
#   compare_dirs_by_content.py {Extension} NameOfDirectory1 NameOfDirectory2
# Prints the report produced by compareDirs(). The exit status is 0 when the
# comparison ran and 1 after an input error or a failed comparison.
if __name__=="__main__":
    import sys

    MatchCase=True
    NoSpaceAtAll=False

    Extension='.ada'
    ExitStatus=0

    print('\n\n# Comparing contents of files - apart from white space ...\n')
    print('# MatchCase  : '+str(MatchCase))

    if len(sys.argv)==4:
        Extension,NameOfDir1,NameOfDir2=sys.argv[1:4]
    elif len(sys.argv)==3:
        NameOfDir1,NameOfDir2=sys.argv[1:3]
    else:
        print('\n! Input ERROR - 2 or 3 arguments are expected:')
        print('!    an optional file extension (eg: .txt - default: .ada)')
        print('!    and 2 directory names (eg for Unix: /home/ra/tmp1 /home/ra/tmp2 ))')
        print('!    Separators al the end of the names are optional.\n')
        print('# Compared.\n')
        sys.exit(1)
    # end if

    # Append the optional separators first, so that e.g. "tmp" and "tmp/"
    # are recognized as the same directory by the check below.
    if not NameOfDir1.endswith(os.sep):
        NameOfDir1+=os.sep

    if not NameOfDir2.endswith(os.sep):
        NameOfDir2+=os.sep

    if NameOfDir1==NameOfDir2:
        print('\n! Input ERROR - The names of the directories are identical!\n')
        print('# Compared.\n')
        sys.exit(1)
    # end if

    print('# Extension  : '+Extension)
    print('\n# Directory 1: '+NameOfDir1)
    print('# Directory 2: '+NameOfDir2)

    try:
        Outputs=compareDirs(NameOfDir1,NameOfDir2,Extension,MatchCase,NoSpaceAtAll)
        for Line in Outputs:
            print(Line)
    except Exception:
        # Not a bare "except:", so Ctrl-C and sys.exit() are not reported
        # as file errors.
        print('\n\n! File ERROR - An exception occured.')
        print('             - Note: The files must be text files.')
        ExitStatus=1
    # end try

    print('# Compared.')
    sys.exit(ExitStatus)
# end if main

# ------------------------------------------------------------------------------#
# ------------------------------------------------------------------------------#