File size: 5,123 Bytes
5c9f913
 
 
 
 
 
 
 
 
 
6f984e1
5c9f913
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c17eebd
5c9f913
 
 
af7db15
5c9f913
 
7987c67
5c9f913
 
 
 
 
 
 
 
 
 
 
6f984e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c9f913
6f984e1
5c9f913
 
 
 
 
 
 
6f984e1
5c9f913
6f984e1
 
5c9f913
 
6f984e1
5c9f913
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f984e1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import streamlit as st
from paddleocr import PaddleOCR
from langchain_groq import ChatGroq
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
import fitz  
import json
from PIL import Image

# Must be the first Streamlit call in the script: the cached loader below can
# render a spinner on its first run, and page config has to precede any
# rendered element.
st.set_page_config(layout="wide")


@st.cache_resource
def _load_ocr():
    """Build the PaddleOCR engine once and reuse it across script reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, the OCR engine would be re-created on each rerun.

    Returns:
        The shared PaddleOCR instance (angle classification on, Spanish).
    """
    return PaddleOCR(use_angle_cls=True, lang='es')


ocr = _load_ocr()

class CarInfoEntity(BaseModel):
    # Schema of the values extracted from a vehicle order document. Every
    # field is a required string; the Spanish descriptions are part of the
    # schema text, so they are kept in Spanish and stored as proper UTF-8.
    # NOTE: documented with comments on purpose; a class docstring would be
    # picked up by pydantic as the schema description and alter the prompt.

    # --- Dealer ---
    dealer_name: str = Field(description="Nombre del concesionario o empresa.")
    dealer_address: str = Field(description="Dirección física del concesionario.")
    tax_id: str = Field(description="Número de identificación fiscal del concesionario.")
    contact_phone: str = Field(description="Número de teléfono principal para contactar con el concesionario.")
    contact_fax: str = Field(description="Número de fax del concesionario.")
    contact_email: str = Field(description="Dirección de correo electrónico para consultas.")
    website_url: str = Field(description="Sitio web oficial del concesionario.")
    operating_hours: str = Field(description="Horario habitual de atención del concesionario.")
    saturday_hours: str = Field(description="Horario de atención específico para los sábados.")

    # --- Order ---
    order_date: str = Field(description="Fecha en que se realizó el pedido.")
    order_number: str = Field(description="Identificador único del pedido.")
    sales_rep: str = Field(description="Nombre del vendedor que maneja la transacción.")

    # --- Customer ---
    customer_full_name: str = Field(description="Nombre completo del comprador.")
    customer_address: str = Field(description="Dirección del comprador.")
    customer_city: str = Field(description="Ciudad donde reside el comprador.")
    customer_postal_code: str = Field(description="Código postal de la dirección del comprador.")
    customer_province: str = Field(description="Provincia donde se encuentra el comprador.")
    customer_id: str = Field(description="Número de identificación del comprador (NIF).")
    customer_phone: str = Field(description="Número de teléfono del comprador.")

    # --- Vehicle ---
    vehicle_description: str = Field(description="Descripción del vehículo que se está comprando, incluyendo marca, modelo y año.")
    vehicle_color: str = Field(description="Color del vehículo.")
    vehicle_price: str = Field(description="Precio total del vehículo, incluyendo impuestos.")

# Settings for the Groq-hosted chat model used for entity extraction.
# max_tokens and timeout are passed explicitly as None (no value set here).
_groq_settings = {
    "model": "llama-3.3-70b-versatile",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
}

model = ChatGroq(**_groq_settings)



# Names of the entities the model is asked to extract, in the order they are
# listed in the prompt. Written as whitespace-separated words and split into
# a list of strings.
entity = """
    dealer_name dealer_address tax_id contact_phone contact_fax
    contact_email website_url operating_hours saturday_hours
    order_date order_number sales_rep
    customer_full_name customer_address customer_city customer_postal_code
    customer_province customer_id customer_phone
    vehicle_description vehicle_color vehicle_price
""".split()

# Streamlit App
st.title("Vehicle Information Extractor")
st.write("Upload a PDF file to extract vehicle information.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
use_default = st.checkbox("Use Default Pdf")

# PyMuPDF document for the selected source; stays None when no usable PDF
# is available (nothing uploaded, or the default file is missing).
doc = None

if use_default:
    # The checkbox takes precedence over any uploaded file.
    default_pdf_path = "pedido V.O.pdf"
    if os.path.exists(default_pdf_path):
        doc = fitz.open(default_pdf_path)
        st.write("Using default PDF:")
    else:
        st.error("Default PDF not found.")
elif uploaded_file is not None:
    # Persist the upload to disk because the later OCR step is given a file
    # path. getvalue() returns the whole buffer regardless of the current
    # stream position, whereas read() can return b"" if the stream was
    # already consumed.
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.getvalue())
    doc = fitz.open("temp.pdf")
    st.write("Uploaded PDF:")

if doc:
    col1, col2 = st.columns(2)

    # Left column: show each page as an image so the source document can be
    # compared with the extracted values.
    with col1:
        try:
            for page_num, page in enumerate(doc, start=1):
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                st.image(img, caption=f"Page {page_num}", use_column_width=True)
        finally:
            # The document is only needed for rendering; release the file
            # handle so it does not accumulate across script reruns.
            doc.close()

    # Perform OCR
    ocr_result = ocr.ocr(default_pdf_path if use_default else "temp.pdf")

    # Each recognised line is indexed as line[1][0] to obtain its text.
    # A page on which nothing was detected can come back as None (or empty),
    # so such pages are skipped instead of being iterated.
    extracted_text = []
    for page_lines in ocr_result or []:
        if not page_lines:
            continue
        extracted_text.extend(line[1][0] for line in page_lines)

    all_text = " ".join(extracted_text)

    # NOTE(review): {all_text} is interpolated three times, so the full OCR
    # text is sent to the model three times, and the "table" wording sits next
    # to the parser's own format instructions. Left unchanged here to keep the
    # model's behaviour stable; consider tightening the prompt.
    prompt_text = """Task: Analyze the {all_text} and find out given entity value:{entity} from the {all_text}:

    Output Format: A table with the entity and value. First column contains the {entity} and second column contains the value fetched from the {all_text}.
                   
    Do not include any additional explanations or unnecessary details. 
    {format_instructions}"""

    # The parser supplies the format instructions for the prompt and converts
    # the model's reply into a CarInfoEntity instance.
    parser = PydanticOutputParser(pydantic_object=CarInfoEntity)

    prompt = PromptTemplate(
        template=prompt_text,
        input_variables=["all_text", "entity"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    chain = prompt | model | parser

    # Right column: extraction result, or a warning when OCR found no text
    # (calling the model with an empty document would be pointless).
    with col2:
        if not all_text.strip():
            st.warning("No text could be extracted from the PDF.")
        else:
            output = chain.invoke({"all_text": all_text, "entity": entity})
            st.write("Extracted Vehicle Information (Table):")
            st.table(output)