Unable to open PDF files in Chrome when I try to browse and upload them from my local machine

Summary

I developed a Streamlit program that imports PDF files and extracts information from them. The code ran successfully in local Jupyter, but when I used the same code in the Streamlit app, it was unable to open the PDF file and returned an error like “PdfiumError: Failed to load document (PDFium: File access error).”

import pypdfium2 as pdfium
from io import BytesIO


def convert_pdf_to_images(file_path, scale=300 / 72):
    # Open the PDF and render every page to a PIL image
    pdf_file = pdfium.PdfDocument(file_path)

    page_indices = list(range(len(pdf_file)))

    renderer = pdf_file.render(
        pdfium.PdfBitmap.to_pil,
        page_indices=page_indices,
        scale=scale,
    )

    final_images = []

    # Encode each rendered page as JPEG bytes, keyed by page index
    for i, image in zip(page_indices, renderer):
        image_byte_array = BytesIO()
        image.save(image_byte_array, format='jpeg', optimize=True)
        final_images.append({i: image_byte_array.getvalue()})

    return final_images

Expected behavior:

The app should upload and open the PDF file.

Actual behavior:

It returns the error shown above.

Can anyone please help me with this issue?

Hey @nithinreddyy
I am also trying to run the same program and am facing the same issue.
Please let me know if you have solved it yet or not…

I will be very thankful.

The code runs fine as a plain Python script, but it is unable to open uploaded files when running under Streamlit. Waiting for other experts to answer this question.

You can try this:

import tempfile

with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    temp_file.write(file.getvalue())
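
To connect that back to the original function, here is a minimal sketch, assuming file is the object returned by st.file_uploader and convert_pdf_to_images is the function from the first post:

import tempfile

import streamlit as st

file = st.file_uploader("Upload a PDF", type=["pdf"])
if file is not None:
    # Write the uploaded bytes to disk so pdfium gets a real file path
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(file.getvalue())
        temp_path = temp_file.name
    final_images = convert_pdf_to_images(temp_path)
    # If I read the pypdfium2 docs right, PdfDocument also accepts raw
    # bytes, so this may even work without the temp file:
    # pdf_file = pdfium.PdfDocument(file.getvalue())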

I made a small app that shows the most common words in a PDF, so here is the code.
The app is able to read the PDF file, and I used PyPDF2 in my code.

import streamlit as st
from PyPDF2 import PdfReader
from collections import Counter
import re


def extract_text_from_pdf(pdf_file):
    text = ""
    pdf_reader = PdfReader(pdf_file)
    for page in pdf_reader.pages:
        # extract_text() can come back empty for image-only pages
        text += page.extract_text() or ""
    return text


def get_most_common_words(text, num_words=10):
    words = re.findall(r'\b\w+\b', text.lower())
    word_count = Counter(words)
    return word_count.most_common(num_words)


def main():
    st.title("PDF Word Counter")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is not None:
        st.write("Uploaded PDF file:", uploaded_file.name)
        pdf_text = extract_text_from_pdf(uploaded_file)
        most_common_words = get_most_common_words(pdf_text)
        st.write("Most common words:")
        for word, count in most_common_words:
            st.write(f"{word}: {count}")


if __name__ == "__main__":
    main()

Hope this will work for you
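
Worth noting why this avoids the original error: PyPDF2's PdfReader accepts any file-like object, so the UploadedFile returned by st.file_uploader can be passed to it directly, with no file on disk needed.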

Here is code to convert PDF pages into images:

import streamlit as st
import pdf2image
from io import BytesIO


def convert_pdf_to_images(pdf_bytes):
    # Render every page of the PDF to a PIL image
    images = pdf2image.convert_from_bytes(pdf_bytes)
    return images


def main():
    st.title("PDF to Images Converter")
    pdf_uploaded = st.file_uploader("Select a PDF file", type="pdf")
    if pdf_uploaded is not None:
        if pdf_uploaded.type == "application/pdf":
            images = convert_pdf_to_images(pdf_uploaded.read())
            for i, image in enumerate(images):
                st.image(image, caption=f"Page {i+1}", use_column_width=True)
                buf = BytesIO()
                image.save(buf, format="JPEG")
                byte_im = buf.getvalue()
                # The buffer holds JPEG bytes, so name the download .jpg
                st.download_button("Download", data=byte_im, file_name=f"Image_{i}.jpg")


if __name__ == "__main__":
    main()
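
Note that pdf2image depends on the poppler utilities being installed on the machine running the app; on Streamlit Community Cloud that means adding poppler-utils to packages.txt.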

Inspired by tomjohnh, here is the entire code that works:

from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv
from pytesseract import image_to_string
from PIL import Image
from io import BytesIO
import fitz
import streamlit as st
import multiprocessing
from tempfile import NamedTemporaryFile
import pandas as pd
import json

load_dotenv()

1. Convert PDF file into images via PyMuPDF (fitz)

def convert_pdf_to_images(pdf_path):
    pdf_document = fitz.open(pdf_path)
    images = []

    # Pull every embedded image out of every page
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image = Image.open(BytesIO(base_image["image"]))
            images.append(image)

    pdf_document.close()
    return images

2. Extract text from images via pytesseract

def extract_text_from_img(image_list):
    image_content = []

    # OCR each image and collect the raw text
    for index, image in enumerate(image_list):
        raw_text = str(image_to_string(image))
        image_content.append(raw_text)

    return "\n".join(image_content)


def extract_content_from_url(url: str):
    # Despite the name, this takes a local file path
    images_list = convert_pdf_to_images(url)
    text_with_pytesseract = extract_text_from_img(images_list)
    return text_with_pytesseract

3. Extract structured info from text via LLM

def extract_structured_data(content: str, data_points):
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
    template = """
You are an expert admin person who will extract core information from documents

{content}

Above is the content; please try to extract all data points from the content above
and export in a JSON array format:
{data_points}

Now please extract details from the content and export in a JSON array format,
return ONLY the JSON array:
"""

    prompt = PromptTemplate(
        input_variables=["content", "data_points"],
        template=template,
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    results = chain.run(content=content, data_points=data_points)

    return results

4. Send data to make.com via webhook
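
The webhook step itself was left out of the post. As a rough sketch only, assuming a make.com webhook URL kept in a hypothetical MAKE_WEBHOOK_URL environment variable (make.com webhooks accept a plain JSON POST):

import os
import requests

def send_to_make(results):
    # Hypothetical env var name; set it to your make.com webhook URL
    webhook_url = os.environ["MAKE_WEBHOOK_URL"]
    response = requests.post(webhook_url, json=results)
    response.raise_for_status()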

5. Streamlit app

def main():
    default_data_points = """{
        "invoice_item": "what is the item that was charged",
        "Amount": "how much does the invoice item cost in total",
        "Company_name": "company that issued the invoice",
        "invoice_date": "when was the invoice issued",
    }"""

    st.set_page_config(page_title="Doc extraction", page_icon=":bird:")

    st.header("Doc extraction :bird:")

    data_points = st.text_area(
        "Data points", value=default_data_points, height=170)

    uploaded_files = st.file_uploader(
        "Upload PDFs", accept_multiple_files=True)

    if uploaded_files is not None and data_points is not None:
        results = []
        for file in uploaded_files:
            # Use a .pdf suffix so fitz recognizes the temp file's type
            with NamedTemporaryFile(delete=False, suffix='.pdf') as f:
                f.write(file.getbuffer())
                content = extract_content_from_url(f.name)
                print(content)
                data = extract_structured_data(content, data_points)
                json_data = json.loads(data)
                if isinstance(json_data, list):
                    results.extend(json_data)  # merge list results
                else:
                    results.append(json_data)  # append a single dict

        if len(results) > 0:
            try:
                df = pd.DataFrame(results)
                st.subheader("Results")
                st.data_editor(df)

            except Exception as e:
                st.error(
                    f"An error occurred while creating the DataFrame: {e}")
                st.write(results)  # Print the data to see its content


if __name__ == '__main__':
    multiprocessing.freeze_support()
    main()

What I did was use PyMuPDF instead of pdfium, plus some other changes; try it out. I have removed the entire Xero integration section since I do not need it.
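
For anyone trying this: it needs PyMuPDF installed (the fitz import comes from the pymupdf package), pytesseract together with a Tesseract OCR binary on the machine, and an OPENAI_API_KEY that load_dotenv() can pick up from a .env file.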