Hey guys, I can't get my Streamlit PDF query tool to work, any ideas? I've been working through this Medium article trying to get it running, but I keep hitting the IndexError below:
IndexError:
File "c:\Users\pedro\anaconda3\envs\textgen2\lib\site-packages\streamlit\runtime\scriptrunner\script_runner.py", line 534, in _run_script
exec(code, module.__dict__)
File "C:\Users\pedro\OneDrive\Documents\Chat2db\AI-PredictiveMaintenance\src\Notebooks\pwcfinal.py", line 64, in <module>
st.write(get_llm_response(form_input))
File "C:\Users\pedro\OneDrive\Documents\Chat2db\AI-PredictiveMaintenance\src\Notebooks\pwcfinal.py", line 48, in get_llm_response
vectordb = load_chunk_persist_pdf()
File "C:\Users\pedro\OneDrive\Documents\Chat2db\AI-PredictiveMaintenance\src\Notebooks\pwcfinal.py", line 31, in load_chunk_persist_pdf
vectordb = Chroma.from_documents(
File "c:\Users\pedro\anaconda3\envs\textgen2\lib\site-packages\langchain\vectorstores\chroma.py", line 646, in from_documents
return cls.from_texts(
File "c:\Users\pedro\anaconda3\envs\textgen2\lib\site-packages\langchain\vectorstores\chroma.py", line 610, in from_texts
chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
File "c:\Users\pedro\anaconda3\envs\textgen2\lib\site-packages\langchain\vectorstores\chroma.py", line 237, in add_texts
self._collection.upsert(
File "c:\Users\pedro\anaconda3\envs\textgen2\lib\site-packages\chromadb\api\models\Collection.py", line 295, in upsert
ids, embeddings, metadatas, documents = self._validate_embedding_set(
File "c:\Users\pedro\anaconda3\envs\textgen2\lib\site-packages\chromadb\api\models\Collection.py", line 347, in _validate_embedding_set
ids = validate_ids(maybe_cast_one_to_many(ids))
File "c:\Users\pedro\anaconda3\envs\textgen2\lib\site-packages\chromadb\api\types.py", line 99, in maybe_cast_one_to_many
if isinstance(target[0], (int, float)):
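From the last frame it looks like chromadb is indexing target[0] on an empty list, so I'm wondering if no chunks are actually reaching Chroma.from_documents (e.g. no PDFs getting picked up from the folder). I haven't confirmed this, but a quick check like the one below (same folder path and splitter settings as in my script) should show whether documents and chunked_documents come back empty:

import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

pdf_folder_path = r"C:\Users\pedro\OneDrive\Desktop\Data"
documents = []
for file in os.listdir(pdf_folder_path):
    if file.endswith('.pdf'):
        documents.extend(PyPDFLoader(os.path.join(pdf_folder_path, file)).load())

# If either of these prints 0, Chroma would be handed an empty ids list and
# maybe_cast_one_to_many would fail on target[0], like in the traceback above.
print("pdf pages loaded:", len(documents))
chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10).split_documents(documents)
print("chunks:", len(chunks))

Does that sound like the right thing to check?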
Code:
import os
from dotenv import load_dotenv
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
import chromadb
from langchain.llms import LlamaCpp
def load_chunk_persist_pdf() -> Chroma:
    pdf_folder_path = r"C:\Users\pedro\OneDrive\Desktop\Data"
    documents = []
    for file in os.listdir(pdf_folder_path):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_documents = text_splitter.split_documents(documents)
    client = chromadb.Client()
    if client.list_collections():
        climate_data = client.create_collection("climate_data")
    else:
        print("Collection already exists")
    vectordb = Chroma.from_documents(  # "line 31" in the traceback
        documents=chunked_documents,
        embedding=OpenAIEmbeddings(),
        persist_directory="C:\chroma_store"
    )
    vectordb.persist()
    return vectordb


def create_agent_chain():
    llm = LlamaCpp(
        model_path=r"C:\Users\pedro\anaconda3\envs\textgen2\Lib\site-packages\llama_index\llms\HuggingFaceLLM\TheBloke_Llama-2-7b-Chat-GGUF\llama-2-7b-chat.Q8_0.gguf",
        temperature=0)
    chain = load_qa_chain(llm, chain_type="stuff")
    return chain


def get_llm_response(query):
    vectordb = load_chunk_persist_pdf()  # "line 48" in the traceback
    chain = create_agent_chain()
    matching_docs = vectordb.similarity_search(query)
    answer = chain.run(input_documents=matching_docs, question=query)
    return answer


# Streamlit UI
# ===============
st.set_page_config(page_title="Doc Searcher", page_icon=":robot:")
st.header("Query PDF Source")

form_input = st.text_input('Enter Query')
submit = st.button("Generate")

if submit:
    st.write(get_llm_response(form_input))  # "line 64" in the traceback
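One other thing I'm not sure about: the collection check from the article looks backwards to me (it calls create_collection when list_collections() returns something, and only prints "Collection already exists" otherwise), and I thought Chroma.from_documents manages its own collection anyway. Here's a stripped-down sketch of how I'd expect that step to look, with a guard for the empty-input case (build_vectordb is just a placeholder name, not from the article):

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

def build_vectordb(chunked_documents):
    # guard against the empty-input case that seems to trigger the IndexError
    if not chunked_documents:
        raise ValueError("No PDF chunks found, check the folder path and file extensions")
    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=OpenAIEmbeddings(),
        persist_directory=r"C:\chroma_store",
    )
    vectordb.persist()
    return vectordb

Is that roughly right, or is the real problem somewhere else?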