Added Files
- app.py +209 -0
- requirements.txt +10 -0
- setup.sh +2 -0
app.py
ADDED
@@ -0,0 +1,209 @@
import streamlit as st
from PIL import Image
from pytesseract import pytesseract
import PyPDF2
import enum
import os
import re
from collections import defaultdict
import folium
from streamlit_folium import st_folium
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import wikipedia
from transformers import pipeline
from openai import OpenAI

# NVIDIA OpenAI API Setup
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key="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"  # Replace with actual API key
)

# Load Named Entity Recognition (NER) Model
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

st.set_page_config(page_title="OCR & Historical Analysis", page_icon="📜", layout="wide")

# Custom Styling
def style_text(text):
    return f"""
    <div style='padding:10px;border-radius:10px;
                background-color:#e0e0e0;
                color:#333;
                font-weight:500;
                font-size:16px;'>
        {text}
    </div>
    """

def find_related_documents(query):
    try:
        search_results = wikipedia.search(query, results=5)
        links = [wikipedia.page(result).url for result in search_results]
        return links
    except Exception as e:
        return [f"Error retrieving related documents: {str(e)}"]

def geocode_location(location):
    geolocator = Nominatim(user_agent="streamlit_app")
    try:
        loc = geolocator.geocode(location, timeout=10)
        return (loc.latitude, loc.longitude) if loc else None
    except GeocoderTimedOut:
        return None

def generate_historical_context_nvidia(text):
    """Use NVIDIA OpenAI API to generate a structured, summarized historical context."""

    prompt_analysis = f"""
    Analyze the following text and provide a historical context. Identify:
    - Key historical events
    - Significant figures involved
    - The broader historical significance

    Text: {text}

    Provide a detailed response.
    """

    prompt_summary = """
    Summarize the historical context provided above in a concise and structured format:
    - Limit to 5 bullet points
    - Each bullet point should be under 100 words
    - Avoid unnecessary explanations or preamble; return only the summary
    """

    try:
        # Step 1: Generate Detailed Historical Context
        completion = client.chat.completions.create(
            model="deepseek-ai/deepseek-r1",
            messages=[
                {"role": "system", "content": "You are a historian providing detailed historical insights."},
                {"role": "user", "content": prompt_analysis}
            ],
            temperature=0.4,
            top_p=0.9,
            max_tokens=4096,
            stream=False
        )
        detailed_response = completion.choices[0].message.content.strip()

        # Step 2: Summarize the Historical Context without monologue
        summary_completion = client.chat.completions.create(
            model="deepseek-ai/deepseek-r1",
            messages=[
                {"role": "system", "content": "You are an expert summarizer."},
                {"role": "user", "content": f"{detailed_response}\n\n{prompt_summary}"}
            ],
            temperature=0.4,
            top_p=0.9,
            max_tokens=2048,
            stream=False
        )

        # Extract only the structured summary
        summary_response = summary_completion.choices[0].message.content.strip()

        # Remove AI-generated explanations or redundant preamble
        clean_summary = re.sub(r"^.*?\n\n", "", summary_response, flags=re.DOTALL)

        return clean_summary if clean_summary else "No historical context found."

    except Exception as e:
        return f"Error retrieving AI-generated historical context: {str(e)}"


class OS(enum.Enum):
    Mac = 0
    Windows = 1

class Languages(enum.Enum):
    English = "eng"
    Filipino = "fil"
    Spanish = "spa"

class ImageReader:
    def __init__(self, os):
        if os == OS.Windows:
            pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'

    def extract_text(self, image: Image, lang: Languages):
        extracted_text = pytesseract.image_to_string(image, lang=lang.value)
        return ' '.join(extracted_text.split())

    def extract_text_from_pdf(self, pdf_file, lang: Languages):
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        return text

    def extract_key_details(self, text):
        details = {"dates": set(), "names": set(), "locations": set()}
        date_pattern = r'\b(?:\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}|\d{4})\b'
        details['dates'] = set(re.findall(date_pattern, text))
        entities = nlp(text)

        for entity in entities:
            if "PER" in entity['entity']:
                details['names'].add(entity['word'])
            elif "LOC" in entity['entity']:
                details['locations'].add(entity['word'])

        return details

# UI Layout
st.title("📜 OCR & Historical Context Analyzer")
st.markdown("Extract text from images and PDFs, analyze named entities, and retrieve historical context.")

col1, col2 = st.columns([1, 2])

with col1:
    selected_os = st.selectbox("🖥️ Select your OS", [OS.Windows, OS.Mac], format_func=lambda x: x.name)
    selected_lang = st.selectbox("🌐 Select language", list(Languages), format_func=lambda x: x.name)
    uploaded_file = st.file_uploader("📂 Upload an image or PDF", type=["png", "jpg", "jpeg", "pdf"])

if uploaded_file:
    ir = ImageReader(selected_os)
    extracted_text = ""
    if uploaded_file.type in ["image/png", "image/jpeg"]:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        extracted_text = ir.extract_text(image, selected_lang)
    else:
        extracted_text = ir.extract_text_from_pdf(uploaded_file, selected_lang)

    st.markdown("### 📝 Extracted Text:")
    st.markdown(style_text(extracted_text), unsafe_allow_html=True)

    key_details = ir.extract_key_details(extracted_text)
    st.markdown("### 🔍 Extracted Key Details")
    st.write(f"**📅 Dates:** {', '.join(key_details['dates']) if key_details['dates'] else 'None found'}")
    st.write(f"**👤 Names:** {', '.join(key_details['names']) if key_details['names'] else 'None found'}")
    st.write(f"**📍 Locations:** {', '.join(key_details['locations']) if key_details['locations'] else 'None found'}")

    combined_terms = ' '.join(key_details['dates'].union(key_details['locations']).union(key_details['names']))
    historical_context = generate_historical_context_nvidia(combined_terms)
    st.markdown("### 🏛️ Historical Context")
    st.markdown(style_text(historical_context), unsafe_allow_html=True)

    st.markdown("### 🌍 Search the Web")
    search_query = st.text_input("Enter a keyword or phrase:")
    if search_query:
        search_results = generate_historical_context_nvidia(search_query)
        st.markdown(style_text(search_results), unsafe_allow_html=True)

    related_docs = find_related_documents(combined_terms)
    st.markdown("### 🔗 Related Historical Documents")
    for link in related_docs:
        st.markdown(f"[🔗 {link}]({link})")

    st.markdown("### 🗺️ Map of Key Locations")
    map_center = [10.0, 10.0]
    map_obj = folium.Map(location=map_center, zoom_start=2)

    for loc in key_details['locations']:
        coords = geocode_location(loc)
        if coords:
            folium.Marker(coords, popup=loc).add_to(map_obj)

    st_folium(map_obj, width=700, height=500)
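Note: committing a literal API key, as app.py does above, exposes it as soon as the Space is public. Below is a minimal sketch of the same client setup reading the key from an environment variable instead; the variable name NVIDIA_API_KEY is an assumption, not something defined by this commit.

import os
from openai import OpenAI

# Hypothetical alternative to the hardcoded key in app.py: read the key from an
# environment variable (NVIDIA_API_KEY is an assumed name; set it as a Space
# secret or export it in the shell before launching Streamlit).
api_key = os.environ.get("NVIDIA_API_KEY")
if not api_key:
    raise RuntimeError("NVIDIA_API_KEY is not set; the historical-context calls will fail.")

client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=api_key,
)

Hugging Face Spaces expose repository secrets as environment variables at runtime, so defining a secret with that name should be enough; locally, export it before running streamlit run app.py.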
requirements.txt
ADDED
@@ -0,0 +1,10 @@
streamlit
pillow
pytesseract
pypdf2
transformers
openai
wikipedia
geopy
folium
streamlit-folium
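Two dependency notes: app.py calls wikipedia.search()/wikipedia.page(), which come from the wikipedia package rather than wikipedia-api, so the list above pins the former. Also, transformers does not install a deep-learning backend on its own, so the NER pipeline needs torch (or another supported backend) to be available; a short smoke test of that chain, assuming torch has been added to requirements.txt or is already present in the environment:

# Minimal check that the NER pipeline used by app.py can actually load;
# requires a backend such as torch in addition to transformers.
from transformers import pipeline

ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
print(ner("Jose Rizal was born in Calamba in 1861."))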
setup.sh
ADDED
@@ -0,0 +1,2 @@
#!/bin/bash
apt-get update && apt-get install -y tesseract-ocr libtesseract-dev
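setup.sh installs the Tesseract engine but no extra language data, while app.py offers Filipino and Spanish OCR; those languages only work if the matching traineddata files are present (on Debian/Ubuntu, presumably via the tesseract-ocr-fil and tesseract-ocr-spa packages, though the exact package names are an assumption). A small check that could be run after setup:

# Verify that the Tesseract language packs referenced in app.py are installed.
import pytesseract

available = set(pytesseract.get_languages(config=""))
missing = {"eng", "fil", "spa"} - available
if missing:
    raise SystemExit(f"Missing Tesseract language data: {', '.join(sorted(missing))}")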