4
\$\begingroup\$

I'm trying to loop through around 100k files in varying degrees of nested directories using vba and md5 hashing each of those files. Right now, the current state of the program takes around 1.5 to 2 hours to run completely. What can I do to increase the speed it is running at?

Option Explicit

Public Const OriginalFolder As String = "\\path\to\original\"
Public Const NewFolder As String = "\\path\to\the\copy\"

Public Sub runRec()
    Dim StartTime As Double
    StartTime = Timer
    Dim originalFiles As Dictionary
    Set originalFiles = New Dictionary
    
    recurse OriginalFolder, OriginalFolder, originalFiles
    
    Debug.Print "Runtime to first recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    Debug.Print "Original File Count: " & originalFiles.count
    
    Dim newFiles As Dictionary
    Set newFiles = New Dictionary
    
    recurse NewFolder, NewFolder, newFiles
    
    Debug.Print "Runtime to second recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    Debug.Print "Copy File Count: " & newFiles.count
    
    'Remove matching md5 hashes
    Dim originalKey As Variant
    
    For Each originalKey In originalFiles.Keys
        If newFiles.Exists(originalKey) Then
            originalFiles.Remove originalKey
            newFiles.Remove originalKey
        End If
    Next
    
    Debug.Print "Runtime to md5: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    
    Dim originalColl As Collection
    Set originalColl = New Collection
    Dim copyColl As New Collection
    Set copyColl = New Collection
    
    'Remove any files with same filename and relative path
    Dim copyKey As Variant
    
    For Each originalKey In originalFiles.Keys
        For Each copyKey In newFiles.Keys
            If originalFiles.item(originalKey) = newFiles.item(copyKey) Then
                originalColl.Add originalKey
                copyColl.Add copyKey
            End If
        Next
    Next
    
    For Each originalKey In originalColl
        originalFiles.Remove originalKey
    Next
    
    For Each copyKey In copyColl
        newFiles.Remove copyKey
    Next
    
    Debug.Print "Runtime to relative: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    
    'Report any files that have the same name but different location as an anomaly
    
    prepDictionary originalFiles
    prepDictionary newFiles
    
    Set originalColl = New Collection
    
    For Each originalKey In originalFiles.Keys
        For Each copyKey In newFiles.Keys
            If originalFiles.item(originalKey) = newFiles.item(copyKey) Then
                originalColl.Add originalKey
            End If
        Next
    Next
    
    Debug.Print "Anomaly count: " & originalColl.count
    
    Debug.Print "Runtime to Anomaly : " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    
    'Report missing files
    
    Debug.Print "Original File Count: " & originalFiles.count
    Debug.Print "Copy File Count: " & newFiles.count
    
End Sub

Private Sub prepDictionary(ByRef dict As Dictionary)
    Dim dictKey As Variant
    Dim slashLocation As Long
    
    For Each dictKey In dict.Keys
        slashLocation = InStrRev(dict.item(dictKey), "\")
        
        dict.item(dictKey) = Right(dict.item(dictKey), Len(dict.item(dictKey)) - slashLocation)
    Next
End Sub

Private Sub recurse(ByVal basePath As String, ByVal folderPath As String, ByRef fileDict As Dictionary)
    Dim FSO As FileSystemObject
    Dim currentFolder As Folder
    Dim subFolder As Folder
    Dim folderFile As File
    
    Set FSO = New FileSystemObject
    
    Set currentFolder = FSO.GetFolder(folderPath)
    
    Dim fileHash As String
    For Each folderFile In currentFolder.Files
        'Need to ignore files that dont have a size and lock files
        If FileLen(folderFile.path) > 0 And Left(folderFile.Name, 2) <> "~$" And folderFile.Name <> "Thumbs.db" Then
            fileHash = FileToMD5Hex(folderFile.path)
            
            If Not fileDict.Exists(fileHash) Then
                fileDict.Add fileHash, Right(folderFile.path, Len(folderFile.path) - Len(basePath))
            End If
        End If
    Next
    
    For Each subFolder In currentFolder.SubFolders
        recurse basePath, subFolder.path, fileDict
    Next
End Sub

Public Function FileToMD5Hex(sFileName As String) As String
    Dim enc
    Dim bytes
    Dim outstr As String
    Dim pos As Integer
    Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
    
    'Convert the string to a byte array and hash it
    bytes = GetFileBytes(sFileName)
    bytes = enc.ComputeHash_2((bytes))
    
    'Convert the byte array to a hex string
    For pos = 1 To LenB(bytes)
        outstr = outstr & LCase(Right("0" & Hex(AscB(MidB(bytes, pos, 1))), 2))
    Next
    
    FileToMD5Hex = outstr
    
    Set enc = Nothing
End Function

Private Function GetFileBytes(ByVal path As String) As Byte()
    Dim lngFileNum As Long
    Dim bytRtnVal() As Byte
    
    lngFileNum = FreeFile
    
    If LenB(Dir(path, vbNormal + vbReadOnly + vbHidden)) Then ''// Does file exist?
        Open path For Binary Access Read As lngFileNum
        ReDim bytRtnVal(LOF(lngFileNum) - 1&) As Byte
        Get lngFileNum, , bytRtnVal
        Close lngFileNum
    Else
        Err.Raise 53
    End If
    
    GetFileBytes = bytRtnVal
    
    Erase bytRtnVal
End Function
\$\endgroup\$
9
  • \$\begingroup\$ what does that debug print? Just to get an idea of your performance measures? \$\endgroup\$ Commented Aug 1, 2024 at 10:53
  • \$\begingroup\$ @Greedo Yes, since it won't be running in any short amount of time. I'd just like to get a general idea of the performance \$\endgroup\$ Commented Aug 1, 2024 at 14:36
  • 1
    \$\begingroup\$ Sorry I meant would you be able to give the log so we can see what your profiling has found. You can edit the post to add this info \$\endgroup\$ Commented Aug 1, 2024 at 14:45
  • 1
    \$\begingroup\$ As @Greedo notes: Your own diagnostics (which parts are taking the most time) would be helpful. I have tried to follow the code, but it is difficult, it seems to follow a tortured path. I don't have time to tease out all the issues. \$\endgroup\$ Commented Aug 3, 2024 at 7:23
  • 1
    \$\begingroup\$ Out of scope of your question, I notice you raise an error at one point, rather than handling it gracefully. Why (given that there may be a good reason for this)? For me, any Err action is a red flag unless properly explained. \$\endgroup\$ Commented Aug 3, 2024 at 7:29

1 Answer 1

3
\$\begingroup\$

Get files

You can speed up retrieving the files by importing LibFileTools module from VBA-FileTools. Your recurse method then becomes:

Private Sub recurse(ByVal folderPath As String, ByRef fileDict As Dictionary)
    Dim filePath As Variant
    Dim fileHash As String
    '
    For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
        If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
            fileHash = FileToMD5Hex(CStr(filePath))
            If Not fileDict.Exists(fileHash) Then
                fileDict.Add fileHash, Right(filePath, Len(filePath) - Len(folderPath))
            End If
        End If
    Next
End Sub

Since you're ignoring duplicated hashes, and storing just first found, you could just store last found by updating to this:

Private Sub recurse(ByVal folderPath As String, ByRef fileDict As Dictionary)
    Dim filePath As Variant
    '
    For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
        If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
            fileDict(FileToMD5Hex(CStr(filePath))) = Right(filePath, Len(filePath) - Len(folderPath))
        End If
    Next
End Sub

Of course you will then need to update the main runRec function and:

'Replace this:
recurse OriginalFolder, OriginalFolder, originalFiles
'with this:
recurse OriginalFolder, originalFiles

'And replace this:
recurse NewFolder, NewFolder, newFiles
'with this:
recurse NewFolder, newFiles

Finally, the GetFiles function does the recursion for you so you might want to rename the method from recurse to something like HashFilesMD5.

Hashing

We can improve on the FileToMD5Hex function by:

  1. using a map for the hex values - this avoid unnecessary function calls
  2. using a buffer for the result - because string concatenation is slow
Public Function FileToMD5Hex(sFileName As String) As String
    Static enc As Object
    Dim bytes() As Byte
    If enc Is Nothing Then Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
    
    ReadBytes sFileName, bytes
    On Error Resume Next
    bytes = enc.ComputeHash_2((bytes))
    If Err.Number <> 0 Then
        Err.Clear
        Exit Function
    End If
    On Error GoTo 0
    
    'Convert the byte array to a hex string
    Static map(0 To 255) As String
    Dim i As Long
    
    'Cache hex values for bytes
    If LenB(map(0)) = 0 Then
        For i = 0 To 255
            map(i) = LCase(Right$("0" & Hex$(i), 2))
        Next i
    End If

    FileToMD5Hex = Space$((UBound(bytes) - LBound(bytes) + 1) * 2)
    For i = LBound(bytes) To UBound(bytes)
        Mid$(FileToMD5Hex, i * 2 + 1) = map(bytes(i))
    Next i
End Function

Notice that ReadBytes is already part of same library.

Mac

In case you want to use this on a Mac operating system, then you need to update the prepDictionary method to something like this:

Private Sub prepDictionary(ByRef dict As Dictionary)
    Dim dictKey As Variant
    Dim dictItem As String
    Dim separatorLocation As Long
    
    For Each dictKey In dict
        dictItem = dict.Item(dictKey)
        separatorLocation = InStrRev(dictItem, PATH_SEPARATOR)
        dict.Item(dictKey) = Right$(dictItem, Len(dictItem) - separatorLocation)
    Next
End Sub

Notice that PATH_SEPARATOR is a constant defined in the same library I linked to.

Dictionary

You can import the Dictionary class from VBA-FastDictionary to make your code work on Mac. The added advantage is that this class is generally faster on Windows as well, especially for more than 32k key-item pairs - see benchmarking.

Final code

Notice that I removed all occurences of [dict].Keys and left only [dict] because a For Each.. loop on a dictionary will loop through the keys using a faster iterator object while looping .Keys makes a copy of the internal array of keys first before the iteration happens on the actual array.

Option Explicit

Public Const OriginalFolder As String = "\\path\to\original\"
Public Const NewFolder As String = "\\path\to\the\copy\"

Public Sub runRec()
    Dim StartTime As Double
    StartTime = Timer
    Dim originalFiles As Dictionary
    Set originalFiles = New Dictionary
    
    HashFilesMD5 OriginalFolder, originalFiles
    
    Debug.Print "Runtime to first recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    Debug.Print "Original File Count: " & originalFiles.Count
    
    Dim newFiles As Dictionary
    Set newFiles = New Dictionary
    
    HashFilesMD5 NewFolder, newFiles
    
    Debug.Print "Runtime to second recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    Debug.Print "Copy File Count: " & newFiles.Count
    
    'Remove matching md5 hashes
    Dim originalKey As Variant
    
    For Each originalKey In originalFiles
        If newFiles.Exists(originalKey) Then
            originalFiles.Remove originalKey
            newFiles.Remove originalKey
        End If
    Next
    
    Debug.Print "Runtime to md5: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    
    Dim originalColl As Collection
    Set originalColl = New Collection
    Dim copyColl As New Collection
    Set copyColl = New Collection
    
    'Remove any files with same filename and relative path
    Dim copyKey As Variant
    
    For Each originalKey In originalFiles
        For Each copyKey In newFiles
            If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
                originalColl.Add originalKey
                copyColl.Add copyKey
            End If
        Next
    Next
    
    For Each originalKey In originalColl
        originalFiles.Remove originalKey
    Next
    
    For Each copyKey In copyColl
        newFiles.Remove copyKey
    Next
    
    Debug.Print "Runtime to relative: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    
    'Report any files that have the same name but different location as an anomaly
    
    PrepDictionary originalFiles
    PrepDictionary newFiles
    
    Set originalColl = New Collection
    
    For Each originalKey In originalFiles
        For Each copyKey In newFiles
            If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
                originalColl.Add originalKey
            End If
        Next
    Next
    
    Debug.Print "Anomaly count: " & originalColl.Count
    
    Debug.Print "Runtime to Anomaly : " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
    
    'Report missing files
    
    Debug.Print "Original File Count: " & originalFiles.Count
    Debug.Print "Copy File Count: " & newFiles.Count
    
End Sub

Private Sub PrepDictionary(ByRef dict As Dictionary)
    Dim dictKey As Variant
    Dim dictItem As String
    Dim separatorLocation As Long
    
    For Each dictKey In dict
        dictItem = dict.Item(dictKey)
        separatorLocation = InStrRev(dictItem, PATH_SEPARATOR)
        dict.Item(dictKey) = Right$(dictItem, Len(dictItem) - separatorLocation)
    Next
End Sub

Private Sub HashFilesMD5(ByVal folderPath As String, ByRef fileDict As Dictionary)
    Dim filePath As Variant
    '
    For Each filePath In GetFiles(folderPath, includeSubFolders:=True)
        If (Not filePath Like "*~$*") And (Not filePath Like "*Thumbs.db") Then
            fileDict(FileToMD5Hex(CStr(filePath))) = Right(filePath, Len(filePath) - Len(folderPath))
        End If
    Next
End Sub

Public Function FileToMD5Hex(sFileName As String) As String
    Static enc As Object
    Dim bytes() As Byte
    If enc Is Nothing Then Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
    
    ReadBytes sFileName, bytes
    On Error Resume Next
    bytes = enc.ComputeHash_2((bytes))
    If Err.Number <> 0 Then
        Err.Clear
        Exit Function
    End If
    On Error GoTo 0
    
    'Convert the byte array to a hex string
    Static map(0 To 255) As String
    Dim i As Long
    
    'Cache hex values for bytes
    If LenB(map(0)) = 0 Then
        For i = 0 To 255
            map(i) = LCase(Right$("0" & Hex$(i), 2))
        Next i
    End If

    FileToMD5Hex = Space$((UBound(bytes) - LBound(bytes) + 1) * 2)
    For i = LBound(bytes) To UBound(bytes)
        Mid$(FileToMD5Hex, i * 2 + 1) = map(bytes(i))
    Next i
End Function

Final thoughts

I did not modify code like this:

For Each originalKey In originalFiles
    For Each copyKey In newFiles
        If originalFiles.Item(originalKey) = newFiles.Item(copyKey) Then
            originalColl.Add originalKey
            copyColl.Add copyKey
        End If
    Next
Next

but if I were to try to improve it, I would probably create a new dictionary with the keys being the actual paths in originalFiles and then just use dict.Exists while looping through the files in newFiles.

\$\endgroup\$

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.