I have created a script that watches a folder for new files, converts them from Word to PDF, creates a folder named after each file, and moves the files into it. The problem is that when there are many files, the conversion is slow. Here is the code. I am not sure which way to go: should I learn multithreading or asynchronous file handling for this particular problem?
import os
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from win32com import client
import pythoncom
import shutil
import asyncio
from docx2pdf import convert
"""import aspose.words as aw
"""
# Base directory under which DocFileHandler creates the per-document folders
# (it is passed to DocFileHandler.createFolder from on_created).
baseAd = r"C:\inetpub\wwwroot\utkuploads"
"""This part is for catching errors.
"""
def createText(filename, filedetail, directory=r"C:\inetpub\wwwroot\utkuploads"):
    """Write a small diagnostic text file named ``<filename>.txt``.

    The handlers in this file use it to record the text of exceptions they
    catch instead of raising them.

    Args:
        filename: Base name of the report file, without the ``.txt`` extension.
        filedetail: Content to write; it is formatted with ``str()`` semantics.
        directory: Folder that receives the file. Defaults to the upload
            directory that was previously hard-coded here.

    Raises:
        OSError: If the file cannot be created or written.
    """
    report_path = os.path.join(directory, f"{filename}.txt")
    # Explicit UTF-8: exception messages may contain characters that the
    # platform's default code page cannot encode, which would make the error
    # reporter itself fail with UnicodeEncodeError.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(f"{filedetail}")
"""This doc2pdf works with WORD in backend. It opens word and converts the file to pdf.
"""
def doc2pdf(doc_name, pdf_name):
    """Convert a Word document to PDF by automating Microsoft Word over COM.

    Every call starts its own Word instance (``DispatchEx``) and quits it
    again before returning, so the cost of launching Word is paid per file.

    Args:
        doc_name: Path of the document to open; it is opened read-only.
        pdf_name: Path of the PDF to write. An existing file at that path is
            deleted first.

    Returns:
        ``pdf_name``, whether or not the export succeeded. A failing
        ``SaveAs`` is recorded through ``createText('saveasExceptionXX', ...)``
        rather than raised.

    Raises:
        Exception: Errors from deleting the old PDF, starting Word or opening
            the document propagate to the caller. The Word instance and the
            COM initialisation are released before they do.
    """
    try:
        os.remove(pdf_name)
    except FileNotFoundError:
        pass  # nothing to replace

    pythoncom.CoInitialize()
    try:
        word = client.DispatchEx("Word.Application")
        try:
            worddoc = word.Documents.Open(doc_name, ReadOnly=1)
            try:
                # 17 is Word's wdFormatPDF save format.
                worddoc.SaveAs(pdf_name, FileFormat=17)
            except Exception as e:
                createText('saveasExceptionXX', f"{e}")
            finally:
                worddoc.Close()
        finally:
            # Always quit, otherwise a failed Open leaves a Word process behind.
            word.Quit()
    finally:
        pythoncom.CoUninitialize()
    return pdf_name
"""def doc2pdfX(doc_name, pdf_name):
#second best
convert(doc_name, pdf_name)"""
"""def doc2pdf2zz(doc_name, pdf_name):
#best
doc = aw.Document(doc_name)
doc.save(pdf_name)"""
class DocFileHandler(FileSystemEventHandler):
    """Handle newly created files: convert documents to PDF and file them away.

    File-name conventions implemented by :meth:`on_created`:

    * names starting with ``~$`` are treated as temporary files and ignored;
    * names containing ``@`` are attachments. They are moved into the sibling
      folder named by the text between the ``@`` and the next ``.``;
    * ``.docx`` names containing ``_`` are reported as templates and left alone;
    * ``.docx`` names containing ``TEMPLATE-REPORT`` are left alone;
    * every other ``.docx`` is converted with ``doc2pdf`` and moved, together
      with its PDF, into ``baseAd/<name up to the first dot>``. A
      ``-GENERATED-REPORT`` or ``-IMZALIRAPOR`` marker is removed from that
      folder name.

    All checks look at the file name only, never at the directory part of
    the path, so the watched directory's own name cannot trigger them.

    NOTE(review): the conversion runs synchronously inside the callback, so
    files are converted one after another.
    """

    # Markers stripped from the file name to obtain the destination folder.
    _REPORT_SUFFIXES = ('-GENERATED-REPORT', '-IMZALIRAPOR')

    def is_temporary_file(self, filename):
        """Return True if the base name of ``filename`` starts with ``~$``.

        Accepts either a bare file name or a full path.
        """
        return os.path.basename(filename).startswith("~$")

    def createFolder(self, baseAd, folderName):
        """Ensure ``baseAd/folderName`` exists and return its path."""
        path = os.path.join(baseAd, folderName)
        os.makedirs(path, exist_ok=True)
        return path

    def createFolderAtt(self, folderName):
        """Ensure the folder ``folderName`` (a full path) exists and return it."""
        os.makedirs(folderName, exist_ok=True)
        return folderName

    def on_created(self, event):
        """Dispatch a "created" event to the attachment or document handler.

        Exceptions are never propagated; their text is written through
        ``createText`` instead.

        Args:
            event: Event object providing ``src_path``, ``is_directory`` and
                ``event_type``.
        """
        try:
            file_name = os.path.basename(event.src_path)
            print(f'File name {os.path.split(event.src_path)}, splittted: {file_name} has entered to server.')

            if event.is_directory or event.event_type != 'created':
                return
            if self.is_temporary_file(file_name):
                return

            if '@' in file_name:
                self._file_attachment(event.src_path, file_name)
            elif file_name.lower().endswith('.docx'):
                self._convert_document(event.src_path, file_name)
        except Exception as e:
            createText('outerAllExceptionasOccured', f'{e}')

    def _convert_document(self, doc_path, file_name):
        """Convert one ``.docx`` to PDF and move both files into their folder."""
        if '_' in file_name:
            print(f'New Template has been detected: {doc_path}')
            return
        if 'TEMPLATE-REPORT' in file_name:
            return

        suffix = next((s for s in self._REPORT_SUFFIXES if s in file_name), None)
        if suffix is None:
            if 'GENERATED-REPORT' in file_name:
                # Marker present without its leading hyphen: such files have
                # never been converted, keep skipping them.
                return
            suffix = ''

        try:
            pdf_path = os.path.splitext(doc_path)[0] + '.pdf'
            doc2pdf(doc_path, pdf_path)

            # The folder name is everything before the FIRST dot; the
            # attachment handler derives its folder name the same way.
            folder_name = file_name.split(".")[0]
            if suffix:
                folder_name = folder_name.replace(suffix, '')
            target_dir = self.createFolder(baseAd, folder_name)

            shutil.move(pdf_path, os.path.join(target_dir, os.path.basename(pdf_path)))
            shutil.move(doc_path, os.path.join(target_dir, file_name))
        except Exception as e:
            createText('exceptionHasOccured...', f'{e}')

    def _file_attachment(self, src_path, file_name):
        """Move an ``@`` attachment into the folder named after its ``@`` part."""
        folder_name = file_name.split("@")[1].split(".")[0]
        try:
            target_dir = self.createFolderAtt(
                os.path.join(os.path.dirname(src_path), folder_name))
            shutil.move(src_path, os.path.join(target_dir, file_name))
        except Exception as e:
            createText('InnerAttachmentError', f'{e}')
if __name__ == '__main__':
    # Script entry point: watch the upload directory (non-recursively) and
    # hand every event to DocFileHandler until interrupted with Ctrl+C.
    directory_to_watch = r"C:\inetpub\wwwroot\utkuploads"
    event_handler = DocFileHandler()
    observer = Observer()
    observer.schedule(event_handler, path=directory_to_watch, recursive=False)
    observer.start()
    try:
        # Sleep instead of spinning: a bare `while True: pass` keeps one CPU
        # core fully busy and competes with the conversion work.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
`SaveAs` call most of the time. Maybe you could gain some time by running several of those at once, but that may just be a bottleneck.