Processing hundreds of CSV files one row at a time for embedding and upload to Pinecone, using OpenAI embeddings

67 views

This is my current code, which works for a while and then fails with the error "RuntimeError: can't start new thread." I have tried both threading and multiprocessing, and both eventually cause this error.

def process_file(file_path):
    """Embed every row of one CSV file and upsert it into the Pinecone index.

    Each row must provide the columns 'text', 'row2data', 'year' and
    'group_id'.  Rows are processed sequentially within this function.

    :param file_path: path to the CSV file to ingest.
    :returns: None (side effect: documents are written to the Pinecone index).
    """
    print(f'file: {file_path}')

    def process_row(row):
        # BUG FIX: as posted, this body was not indented under the `def`,
        # which is a SyntaxError — re-indented so process_row actually
        # encloses the per-row work.
        text = row['text']
        row2data = row['row2data']  # read but unused downstream; kept as-is
        year = row['year']
        group_id = row['group_id']
        docs = embedder(text, text, year, group_id)
        # NOTE(review): from_documents() appears to build a fresh vector-store
        # object (and its own connection/thread pool) on EVERY row — presumably
        # this is what eventually exhausts OS threads. Consider constructing
        # the index once per file and calling add_documents() per row/batch;
        # confirm against the Pinecone/LangChain client docs.
        my_index = pc_store.from_documents(
            docs, embeddings, index_name=PINECONE_INDEX_NAME
        )

    with open(file_path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            process_row(row)

if __name__ == '__main__':
    # Local imports: this guard is the script entry point, so importing
    # here keeps the block self-contained.
    import os
    from multiprocessing import Pool

    file_paths = ['file1', 'file2', 'file3']

    # FIX: the original started one unbounded Process per file. With
    # hundreds of files, each worker's client spawning its own threads,
    # the OS eventually refuses with "RuntimeError: can't start new
    # thread". A bounded Pool caps concurrency while still processing
    # every file, and pool.map() joins all workers before exiting.
    max_workers = min(len(file_paths), os.cpu_count() or 1)
    with Pool(processes=max_workers) as pool:
        pool.map(process_file, file_paths)

Here is the stack trace of the error:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/pool.py", line 215, in __init__
    self._repopulate_pool()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/pool.py", line 306, in _repopulate_pool
    return self._repopulate_pool_static(self._ctx, self.Process,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/pool.py", line 329, in _repopulate_pool_static
    w.start()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/dummy/__init__.py", line 51, in start
    threading.Thread.start(self)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/threading.py", line 971, in start
    _start_new_thread(self._bootstrap, ())
RuntimeError: can't start new thread
Votes: 0

There are no answers posted yet.