from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.indexes import VectorstoreIndexCreator
from langchain_experimental.agents.agent_toolkits.csv.base import create_csv_agent
from langchain.agents.agent_types import AgentType
from io import BytesIO
import tiktoken
import time
import textwrap
import os
import pandas as pd
import requests
import streamlit as st
import json
from langchain.document_loaders import PyPDFLoader

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """
"""

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')
    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)
    return wrapped_text
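
# Editor's note, a quick illustration of the helper above (hypothetical input):
# wrap_text_preserve_newlines("a b c\nd", width=3) re-wraps each original line
# separately and returns "a b\nc\nd", so the existing newlines survive.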

def process_llm_response(llm_response):
    return wrap_text_preserve_newlines(llm_response['result'])
    # print('\n\nSources:')
    # for source in llm_response["source_documents"]:
    #     print(source.metadata['source'])


def get_prompt(instruction, new_result_str_modified, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + new_result_str_modified + E_SYS
    prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template
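
# Editor's note, what the wrapper produces (hypothetical arguments): with the
# constants above, get_prompt("Hi {context}", "TEMPLATE") returns the
# Llama-2-style string
# "[INST]<<SYS>>\n\nTEMPLATE\n<</SYS>>\n\nHi {context}[/INST]".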

def extract_key_value_pairs(data, parent_key='', sep='_'):
    items = []
    if isinstance(data, dict):
        for key, value in data.items():
            new_key = f"{parent_key}{sep}{key}" if parent_key else key
            items.extend(extract_key_value_pairs(value, new_key, sep=sep))
    else:
        items.append((parent_key, data))
    return items
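
# Editor's note, a small example of the flattener (hypothetical data):
# extract_key_value_pairs({"shipper": {"name": "ACME", "city": "OSLO"}})
# -> [("shipper_name", "ACME"), ("shipper_city", "OSLO")]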

# Function to load the uploaded JSON file into a Python object
def extract_child_json(json_file):
    if json_file:
        # Read the content of the uploaded JSON file
        file_contents = json_file.read()
        try:
            # Decode the content as 'utf-8', falling back to 'latin-1'
            decoded_content = file_contents.decode("utf-8")
        except UnicodeDecodeError:
            decoded_content = file_contents.decode("latin-1")
        try:
            # Load the JSON content into a Python dictionary
            json_obj = json.loads(decoded_content)
            # Return the JSON content as-is, without any filtering
            child_json = json_obj
            print(type(child_json))
            return child_json
        except json.JSONDecodeError:
            st.error("Error: Invalid JSON file")
            return None

def process_pdf(pdf_file):
    if pdf_file:
        # Read the content of the uploaded PDF file
        pdf_contents = pdf_file.read()
        # Save the uploaded PDF file to a temporary location
        with open("temp.pdf", "wb") as temp_pdf:
            temp_pdf.write(pdf_contents)
        # Process the uploaded PDF using LangChain operations
        txt_file_path = "temp.pdf"  # Path to the temporary PDF file
        loader = PyPDFLoader(file_path=txt_file_path)
        data = loader.load()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        processed_data = text_splitter.split_documents(data)
        # Delete the temporary PDF file after processing
        os.remove("temp.pdf")
        return processed_data
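
# Editor's sketch (assumption, not part of the original app): a fixed
# "temp.pdf" name can collide when two Streamlit sessions upload at once;
# tempfile sidesteps that. Same splitting behaviour as process_pdf above.
import tempfile

def process_pdf_tempfile(pdf_file):
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_file.read())
        tmp_path = tmp.name
    try:
        data = PyPDFLoader(file_path=tmp_path).load()
        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return splitter.split_documents(data)
    finally:
        # Always clean up the per-upload temporary file
        os.remove(tmp_path)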

def process_with_nanonet(pdf_file):
    if pdf_file:
        pdf_contents = pdf_file.read()
        print("Here's your PDF Content:")
        print(pdf_contents)
        with open("temp.pdf", "wb") as temp_pdf:
            temp_pdf.write(pdf_contents)
        # url = ''
        url = ''
        data = {'file': open("temp.pdf", 'rb')}
        response = requests.post(url, auth=requests.auth.HTTPBasicAuth('', ''), files=data)
        response_json = response.json()
        for page in response_json['result']:
            if page['page'] == 0:
                required_json = page['request_metadata']
                print(required_json)
        os.remove("temp.pdf")
        return required_json
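
# Editor's note (assumption): requests does not raise on HTTP errors by itself,
# so a failed Nanonets call surfaces as a confusing JSONDecodeError; calling
# response.raise_for_status() right after requests.post makes failures explicit.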

def get_processing_type():
    return "openai"

def main():
    processing_type = get_processing_type()
    st.title("Jules BL Checker")
    available_json_files = [os.path.splitext(filename)[0] for filename in os.listdir('.') if filename.endswith('.json')]
    # Create a dropdown for the user to select a JSON file without displaying the '.json' extension
    selected_json_file_stem = st.selectbox("Select BL Instructions:", available_json_files)
    # Get the full filename by adding the '.json' extension back
    selected_json_filename = selected_json_file_stem + ".json"
    st.write("Upload BL Draft in PDF:")
    pdf_file = st.file_uploader("", accept_multiple_files=False, type='pdf')
    # Initialize new_result_str_modified before the PROCESS button
    new_result_str_modified = ""
    show_instructions = st.checkbox("Show BL Instructions", value=False)
    show_filled_template = st.checkbox("Show the Processed BL Draft", value=False)
    if 'show_feedback' not in st.session_state:
        st.session_state.show_feedback = False
    # Add the PROCESS button
    if st.button("PROCESS"):
        child_json = None
        processed_text = None
        # Attempt to read the JSON file safely
        try:
            with open(selected_json_filename, 'r') as json_file:
                child_json = json.load(json_file)
        except Exception as e:
            st.error(f"An error occurred while reading the JSON file: {e}")
        # Process the PDF if a file was uploaded
        if pdf_file:
            file_buffer = BytesIO(pdf_file.read())
            # Reset the pointer of the original pdf_file before re-reading it
            pdf_file.seek(0)
            processed_text = process_pdf(pdf_file)
        if child_json:
            # Display the BL instructions if the checkbox is checked
            if show_instructions:
                st.write("Here's the BL Instructions:")
                st.json(child_json)
            # Extract key-value pairs now that child_json has loaded successfully
            key_value_pairs = extract_key_value_pairs(child_json)
            result = {key: value for key, value in key_value_pairs}
            result_str = str(result)
            # Escape braces so the dict string survives PromptTemplate formatting
            new_result_str_modified = result_str.replace('{', '{{').replace('}', '}}')
        if processed_text:
            # Set the API key as an environment variable
            api_key = ""
            os.environ["OPENAI_API_KEY"] = api_key
            if processing_type == "openai":
                t1 = time.perf_counter()
                # Optionally display the processed text in Streamlit:
                # st.write("Processed text:")
                # st.write(processed_text)
                with st.spinner("Processing your request..."):
                    embeddings = OpenAIEmbeddings()
                    vectorstore = FAISS.from_documents(processed_text, embedding=embeddings)
                    sys_prompt = """You're a dictionary generator. Fill up the below details of the template from the document given.
And print the filled template. An example of a filled-up template is as shown - ."""
                    instruction = """CONTEXT:\n\n {context}\n
Response: Display the filled template """
                    from langchain.prompts import PromptTemplate
                    prompt_template = get_prompt(instruction, new_result_str_modified, sys_prompt)
                    prompt = PromptTemplate(
                        template=prompt_template, input_variables=["context"]
                    )
                    print(prompt)
                    chain_type_kwargs = {"prompt": prompt}
                    llm = ChatOpenAI(temperature=0.7, model_name='gpt-4-turbo-preview')
                    memory = ConversationBufferMemory(
                        memory_key='chat_history', return_messages=True)
                    conversation_chain = RetrievalQA.from_chain_type(
                        llm=llm,
                        chain_type="stuff",
                        chain_type_kwargs=chain_type_kwargs,
                        retriever=vectorstore.as_retriever(),
                        memory=memory
                    )
                    query = 'display the filled template'
                    final_result = conversation_chain({'query': query})
                    response_pdf = process_llm_response(final_result)
                t2 = time.perf_counter()
                print(f"processing with openai took {t2 - t1} seconds")
            elif processing_type == "nanonet":
                file_buffer.seek(0)
                with st.spinner("Processing your request..."):
                    t1 = time.perf_counter()
                    response_pdf = process_with_nanonet(file_buffer)
                    t2 = time.perf_counter()
                    print(f"processing with nanonets took {t2 - t1} seconds")
            print('\n\nFilled Template:', response_pdf)
            if response_pdf:
                st.success("Processing complete!")
                if show_filled_template:
                    st.write("Here's the Processed BL Draft:")
                    st.write(response_pdf)
                from langchain.chains import LLMChain
                from langchain_core.prompts import PromptTemplate
                prompt_template = "Compare the dictionary {result_pdf} with {result_app} and print the mismatched values in JSON format (example keys and values: 'PRE_CARRIAGE_BY': ['BARCELONA', ''], 'OCEAN_VESSEL_VOYAGE_NO': ['AFIF 020E', '348E'], etc.). Output SHOULD be a VALID JSON. REMEMBER not to display code in the answer; if you want, you can process the code internally and produce the mismatch by comparing each and every key."
                prompt = PromptTemplate(
                    input_variables=["result_pdf", "result_app"], template=prompt_template
                )
                llm = ChatOpenAI(temperature=0.7, model_name='gpt-4-turbo-preview')
                llm_chain = LLMChain(llm=llm, prompt=prompt)
                with st.spinner("Generating response..."):
                    # Generate the comparison between the PDF contents and the JSON instructions
                    t3 = time.perf_counter()
                    completion = llm_chain.predict(result_pdf=response_pdf, result_app=new_result_str_modified)
                    t4 = time.perf_counter()
                    print(f"comparison prompt took {t4 - t3} seconds")
                # Display the completion outcome
                print(completion)
                st.success("Response generated!")
                # Strip the code fences and the "json" language tag from the reply
                completion = completion.strip('`').replace('json\n', '').strip()
                # Parse the JSON data into a Python dictionary
                json_data = json.loads(completion)
                # Flatten the nested JSON and capture the mismatches
                def extract_mismatches(data, key=None):
                    mismatches = []
                    for k, v in data.items():
                        # Construct a key path to identify nested items
                        new_key = f"{key}.{k}" if key else k
                        # A list holding more than one unique value is a mismatch
                        if isinstance(v, list) and len(set(map(str, v))) > 1:
                            mismatches.append({
                                'Field': new_key,
                                'Expected Value': v[0],
                                'Actual Value': v[1]
                            })
                        # If the value is another dictionary, recurse into it
                        elif isinstance(v, dict):
                            mismatches.extend(extract_mismatches(v, new_key))
                    return mismatches
                def get_corrected_llm_response(completion, result_pdf, result_app, user_suggestions):
                    # Generate a response using the LLMChain with the user's suggestions
                    user_corrected_prompt_template = "You're a response corrector. We earlier had you compute the mismatches {completion} by comparing {result_pdf} and {result_app}. Now take the user's suggestions: {user_suggestions} and provide the mismatch response again, corrected in light of those suggestions. REMEMBER not to display code in the answer; if you want, you can process the code internally and produce the corrected mismatch."
                    prompt = PromptTemplate(
                        input_variables=["completion", "result_pdf", "result_app", "user_suggestions"],
                        template=user_corrected_prompt_template
                    )
                    llm = ChatOpenAI(temperature=0.7, model_name='gpt-4-turbo-preview')
                    llm_chain = LLMChain(llm=llm, prompt=prompt)
                    llm_response = llm_chain.predict(completion=str(completion), result_pdf=response_pdf, result_app=new_result_str_modified, user_suggestions=user_suggestions)
                    return llm_response
                # Use the helper above to extract the mismatches
                mismatches = extract_mismatches(json_data)
                # Convert the mismatches into a DataFrame
                df = pd.DataFrame(mismatches)
                if not df.empty:
                    st.write("Hey, these are the mismatches:")
                    st.dataframe(df)
                else:
                    # If the DataFrame is empty, print the message
                    st.write("Hey, no mismatch found!")
                # user_message = st.text_input("Type your message:", key="user_input")
                if 'corrected_llm_response' not in st.session_state:
                    st.session_state.corrected_llm_response = None
                with st.form(key='suggestions_form'):
                    user_suggestions = st.text_area("Any suggestions that can improve response?", key="user_suggestions")
                    submit_button = st.form_submit_button(label='Submit Suggestions')
                # When the user submits a message, call the LLM
                if submit_button:
                    st.session_state.corrected_llm_response = get_corrected_llm_response(str(completion), result_pdf, result_app, user_suggestions)
                if st.session_state.corrected_llm_response:
                    # Strip the code fences and the "json" language tag from the reply
                    st.session_state.corrected_llm_response = st.session_state.corrected_llm_response.strip('`').replace('json\n', '').strip()
                    # Parse the JSON data into a Python dictionary
                    corrected_json_data = json.loads(st.session_state.corrected_llm_response)
                    corrected_mismatches = extract_mismatches(corrected_json_data)
                    df = pd.DataFrame(corrected_mismatches)
                    if not df.empty:
                        st.write("Hey, these are the corrected mismatches:")
                        st.dataframe(df)
                    else:
                        # If the DataFrame is empty, print the message
                        st.write("Hey, no mismatch found on corrections!")
                    st.session_state.show_feedback = True
    from trubrics.integrations.streamlit import FeedbackCollector
    collector = FeedbackCollector(
        project="default",
        email="",
        password="",
    )
    # Display the feedback widget once the flag has been set in session state
    if st.session_state.show_feedback:
        collector.st_feedback(
            component="default",
            feedback_type="thumbs",
            model="gpt-3.5-turbo",
            prompt_id=None,  # see prompts to log prompts and model generations
            open_feedback_label='[Optional] Provide additional feedback'
        )

if __name__ == "__main__":
    main()

Here's the code of my Streamlit app. On hitting "Submit Suggestions", the app reruns completely and shows no response, even after involving session state. Can somebody help here?
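
A likely culprit worth checking (editor's note, not a verified fix): the suggestions form and everything that renders its result live inside the `if st.button("PROCESS"):` branch. `st.button` returns True only on the single script run triggered by its own click, so when "Submit Suggestions" fires a fresh rerun, the whole PROCESS branch is skipped and nothing renders; the value saved in st.session_state.corrected_llm_response survives, but the code that reads it never executes. (Separately, `result_pdf` and `result_app` look undefined at the `get_corrected_llm_response(...)` call site inside main.) The usual pattern is to persist the expensive results in session state and keep the form outside the button branch. A minimal sketch, with a hypothetical run_processing() standing in for the whole PROCESS pipeline:

import streamlit as st

def run_processing():
    # Stand-in for the pipeline above (vectorstore, RetrievalQA, comparison
    # LLMChain); returns whatever the form handler will need on later reruns.
    return {"completion": "{}", "response_pdf": "", "result_app": ""}

def main():
    if st.button("PROCESS"):
        # Persist the results so they survive the rerun caused by the form submit
        st.session_state["results"] = run_processing()

    # The form lives OUTSIDE the button branch, guarded by the stored results
    if "results" in st.session_state:
        res = st.session_state["results"]
        with st.form(key="suggestions_form"):
            user_suggestions = st.text_area("Any suggestions that can improve response?")
            submitted = st.form_submit_button("Submit Suggestions")
        if submitted:
            # Call get_corrected_llm_response(res["completion"], res["response_pdf"],
            # res["result_app"], user_suggestions) here; echoing is a stand-in.
            st.session_state["corrected"] = user_suggestions
        if "corrected" in st.session_state:
            st.write(st.session_state["corrected"])

if __name__ == "__main__":
    main()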