This is my current code which works for a while and then throws an error of "can't start a new thread." Tried both threading and multi-processing and both cause this error eventually.
def process_file(file_path):
print(f'file: {file_path}')
def process_row(row):
text = row['text']
row2data = row['row2data']
year = row['year']
group_id = row['group_id']
docs = embedder(text, text, year, group_id)
my_index = pc_store.from_documents(docs, embeddings, index_name=PINECONE_INDEX_NAME)
with open(file_path, 'r') as file:
reader = csv.DictReader(file)
for row in reader:
process_row(row)
if __name__ == '__main__':
file_paths = ['file1', 'file2', 'file3']
processes = []
for file_path in file_paths:
p = Process(target=process_file, args=(file_path,))
p.start()
processes.append(p)
for p in processes:
p.join()
Here is the stack trace of the error:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/pool.py", line 215, in __init__
self._repopulate_pool()
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/pool.py", line 306, in _repopulate_pool
return self._repopulate_pool_static(self._ctx, self.Process,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/pool.py", line 329, in _repopulate_pool_static
w.start()
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/dummy/__init__.py", line 51, in start
threading.Thread.start(self)
File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/threading.py", line 971, in start
_start_new_thread(self._bootstrap, ())
RuntimeError: can't start new thread
process_rowthat causes this issue with a stack trace?multiprocessing.dummy.poolwhich does not show up in your code. The "can't start new thread" error is likely due to starting too many concurrent threads (searchulimit). If you are creating a thread pool each in a large process pool, your number of threads will multiply very quickly (past what will likely give a performance benefit I might add).multiprocessing.poolwith 10 processes rather than creating a new process for each file.