The code runs fine in PyCharm, but breaks after being compiled with PyInstaller. All libraries should be included; I checked, but I may have missed something.
The block of code that causes the problem should call the function load_documents, which in turn calls the function load_single_document. But something goes wrong in this process: some kind of loop occurs that makes the program repeatedly try to call the functions, and it eventually crashes the PC.
def load_documents(source_dir: str, ignored_files: List[str] = None) -> List[Document]:
    """
    Loads all documents from the source documents directory, ignoring specified files.

    Every extension in LOADER_MAPPING is searched recursively under
    ``source_dir`` in both its lower-case and upper-case spelling. The matching
    files are then loaded in parallel by a pool of worker processes, each file
    being passed to ``load_single_document``.

    Args:
        source_dir: Directory that is searched recursively for documents.
        ignored_files: Paths that must not be loaded. ``None`` means that
            nothing is ignored.

    Returns:
        The documents produced by ``load_single_document`` for all loaded
        files, in completion order. If loading raises, the error is printed and
        the documents collected so far are returned.
    """
    if ignored_files is None:
        ignored_files = []
    print("1")
    all_files = []
    print("2")
    for ext in LOADER_MAPPING:
        print("3")
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True)
        )
        print("4")
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True)
        )
    # On case-insensitive filesystems (e.g. Windows) the lower- and upper-case
    # patterns match the same files, so drop duplicate paths (keeping the
    # original order) to avoid loading every document twice.
    unique_files = dict.fromkeys(all_files)
    # Set lookup keeps the filter linear even with many already-ingested files.
    ignored = set(ignored_files)
    filtered_files = [file_path for file_path in unique_files if file_path not in ignored]
    print("5")
    with pl(processes=os.cpu_count()) as pool:
        print("6")
        results = []
        print("7")
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            print("A")
            try:
                for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
                    print("B")
                    results.extend(docs)
                    print("C")
                    pbar.update()
            except Exception as e:
                # Best effort: report the failure and keep what was loaded so far.
                print(f"An error occurred2: {str(e)}")
    print("8")
    return results
def process_documents(ignored_files: List[str] = None) -> List[Document]:
    """
    Load the documents from the source directory and cut them into chunks.

    Args:
        ignored_files: Paths to leave out when loading. ``None`` means that
            nothing is left out.

    Returns:
        The chunks produced by splitting the loaded documents with a
        ``RecursiveCharacterTextSplitter`` configured from the module-level
        ``chunk_size`` and ``chunk_overlap``.

    Note:
        When no documents are loaded, a message is printed and the process
        exits with status 0 via ``sys.exit``.
    """
    skip_list = [] if ignored_files is None else ignored_files

    print(f"Loading documents from {source_directory}")
    loaded_docs = load_documents(source_directory, skip_list)
    print(f"12")

    # Nothing new to ingest: stop the whole script cleanly.
    if not loaded_docs:
        print("No new documents to load")
        sys.exit(0)

    print(f"Loaded {len(loaded_docs)} new documents from {source_directory}")
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(loaded_docs)
    print(f"Split into {len(chunks)} chunks of text (max. {chunk_size} tokens each)")
    return chunks
def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
    """
    Tell whether the vectorstore at ``persist_directory`` already holds documents.

    Args:
        persist_directory: Directory passed to ``Chroma`` as its persist location.
        embeddings: Embedding function passed to ``Chroma``.

    Returns:
        ``True`` if the store's ``get()`` result has a non-empty ``'documents'``
        entry, ``False`` otherwise.
    """
    store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    stored_documents = store.get()['documents']
    return bool(stored_documents)
def main():
    """
    Ingest the source documents into the Chroma vectorstore.

    If the vectorstore already contains documents, only files whose ``source``
    is not yet recorded in its metadata are processed and appended; otherwise a
    new vectorstore is built from all processed documents. The store is
    persisted in both cases.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)

    store_exists = does_vectorstore_exist(persist_directory, embedding_model)
    if not store_exists:
        # Fresh start: build the vectorstore from every document found.
        print("Creating new vectorstore")
        chunks = process_documents()
        print(f"Creating embeddings. May take some minutes...")
        vectorstore = Chroma.from_documents(
            chunks,
            embedding_model,
            persist_directory=persist_directory,
            client_settings=CHROMA_SETTINGS,
            client=client,
        )
    else:
        # Incremental run: skip the sources that are already in the store.
        print(f"Appending to existing vectorstore at {persist_directory}")
        vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
            client_settings=CHROMA_SETTINGS,
            client=client,
        )
        existing = vectorstore.get()
        known_sources = [metadata['source'] for metadata in existing['metadatas']]
        chunks = process_documents(known_sources)
        print(f"Creating embeddings. May take some minutes...")
        vectorstore.add_documents(chunks)

    vectorstore.persist()
    # Drop the reference to the store once it has been persisted.
    vectorstore = None
    print(f"Ingestion complete! You can now run privateGPT.py to query your documents")
# Run the ingestion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
I inserted a bunch of print() calls to see where the code breaks. I discovered that it breaks after print("A"), i.e. around the line `for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):`, or when it tries to call the function `def load_single_document(file_path: str) -> List[Document]:`
This is the log when it went wrong:
3
4
5
6
7
Loading new documents: 0%| | 0/2 [00:00<?, ?it/s]A
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Creating new vectorstore
Loading documents from source_documents
Loading documents from source_documents
(skipping a bunch of 1 2 3 4 5 etc.)
Loading new documents: 0%| | 0/2 [00:00<?, ?it/s]A
6
7
Loading new documents: 0%| | 0/2 [00:00<?, ?it/s]A
6
7
Loading new documents: 0%| | 0/2 [00:00<?, ?it/s]A
6
7
Loading new documents: 0%| | 0/2 [00:00<?, ?it/s]A
6
7
The program doesn’t stop but keeps trying to call the functions until it crashes the computer.
Does anyone know what’s going on?
>Solution :
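You need to add multiprocessing.freeze_support() to your code if it is going to be converted to an exe. Without it, every worker process started by the pool re-runs the frozen executable's main code and starts a pool of its own, which is the endless respawning you see in the log.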
if __name__ == "__main__":
    # Must be called first, straight after the __main__ guard, so that worker
    # processes of a frozen executable are handled here instead of running main().
    multiprocessing.freeze_support()
    main()