JEPHONETORRE committed on
Commit
11bfd4a
·
1 Parent(s): 7f65832
Files changed (3) hide show
  1. .env +1 -0
  2. app.py +190 -0
  3. requirements.txt +10 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ HF_API_KEY=<redacted: rotate the leaked key and keep .env out of version control>
app.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import requests
4
+ from PyPDF2 import PdfReader
5
+ from PIL import Image
6
+ import re
7
+ from collections import Counter
8
+ from streamlit_option_menu import option_menu
9
+ import folium
10
+ from streamlit_folium import st_folium
11
+ from geopy.geocoders import Nominatim
12
+
13
# Fetch the Gemini API key from the environment. GEMINI_API_KEY is the name the
# error message below tells users to set; HF_API_KEY is still honoured as a
# fallback so deployments that configured the old name keep working.
gemini_api_key = os.getenv("GEMINI_API_KEY") or os.getenv("HF_API_KEY")

if not gemini_api_key:
    st.error("API key not found. Please set the GEMINI_API_KEY environment variable.")

# Gemini endpoint. The key is sent as a request header instead of a query
# parameter so it cannot leak through URLs echoed in error messages or logs.
# Defined unconditionally so call_gemini_api never hits a NameError when the
# key is missing; tabs that do not need the API keep working.
url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"

# Headers for the API request.
headers = {
    'Content-Type': 'application/json',
}
if gemini_api_key:
    headers['x-goog-api-key'] = gemini_api_key
26
+
27
+ # Function to call the Gemini API
28
def _extract_gemini_text(response_data):
    """Return the concatenated text parts of the first candidate, or ""."""
    if not isinstance(response_data, dict):
        return ""
    candidates = response_data.get("candidates") or []
    if not candidates:
        return ""
    content = candidates[0].get("content") or {}
    parts = content.get("parts") or []
    return "".join(part.get("text", "") for part in parts)


def call_gemini_api(prompt):
    """Send *prompt* to the Gemini generateContent endpoint and return its text.

    Relies on the module-level ``url`` and ``headers`` for the endpoint and
    authentication.

    Args:
        prompt: The text prompt to send to the model.

    Returns:
        The generated text on success. On failure a human-readable string is
        returned instead of raising: ``"Error: <status>, <body>"`` for non-200
        responses, ``"An error occurred: <exc>"`` for transport or decoding
        errors, and ``"No generated content found."`` when the reply holds no
        text.
    """
    data = {
        "contents": [
            {
                "parts": [
                    {"text": prompt}
                ]
            }
        ]
    }

    try:
        # A timeout keeps a stalled connection from hanging the app forever.
        response = requests.post(url, json=data, headers=headers, timeout=60)
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    if response.status_code != 200:
        return f"Error: {response.status_code}, {response.text}"

    try:
        response_data = response.json()
    except ValueError as e:
        # The body was not valid JSON.
        return f"An error occurred: {e}"

    # generateContent replies carry the text under
    # candidates[0].content.parts[].text; there is no "generatedContent" key.
    generated_content = _extract_gemini_text(response_data)
    return generated_content or "No generated content found."
56
+
57
+ # OCR and Analysis Functions
58
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF and join it with newlines.

    Args:
        file: A path or binary file-like object accepted by ``PdfReader``.

    Returns:
        The page texts joined by ``"\\n"``; pages that yield no text are
        skipped, so the result is ``""`` when nothing could be extracted.
    """
    pdf_reader = PdfReader(file)
    # Extract each page exactly once; the original called extract_text() twice
    # per page (once for the filter, once for the value).
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    return "\n".join(text for text in page_texts if text)
61
+
62
def extract_text_from_image(image):
    """Run OCR on *image* and return the recognised text."""
    # Function-scope import: pytesseract is only loaded when this is called.
    import pytesseract

    ocr_text = pytesseract.image_to_string(image)
    return ocr_text
65
+
66
def extract_keywords(text, num_keywords=10):
    """Return the most frequent words of *text*, most common first.

    Words are lower-cased runs of at least four word characters; a small set
    of stop words is ignored. At most *num_keywords* words are returned.
    """
    stop_words = set("the and for with from this that have will are was were been has".split())
    tally = Counter()
    for match in re.finditer(r'\b\w{4,}\b', text.lower()):
        token = match.group()
        if token not in stop_words:
            tally[token] += 1
    return [pair[0] for pair in tally.most_common(num_keywords)]
72
+
73
def contextualize_document(text):
    """Ask the Gemini API for historical context on the start of *text*.

    Only the first 1000 characters of *text* are included in the prompt.
    Returns whatever ``call_gemini_api`` returns.
    """
    excerpt = text[:1000]
    prompt = f"Provide historical context for the following text:\n\n{excerpt}"
    return call_gemini_api(prompt)
76
+
77
def extract_locations(text):
    """Placeholder location extractor: ignores *text* and returns fixed names.

    Intended to be replaced with NLP-based extraction.
    """
    # Built per call so callers never share (and mutate) one list.
    placeholder_locations = ["Manila, Philippines", "Cebu City, Philippines"]
    return placeholder_locations
81
+
82
def geocode_locations(locations):
    """Resolve location names to coordinates with the Nominatim geocoder.

    Returns a list of ``(name, latitude, longitude)`` tuples for the names
    that were resolved. A name whose lookup raises is reported with
    ``st.warning`` and skipped; a name with no match is skipped silently.
    """
    results = []
    service = Nominatim(user_agent="geoapi")
    for place in locations:
        try:
            hit = service.geocode(place)
            if not hit:
                continue
            results.append((place, hit.latitude, hit.longitude))
        except Exception as err:
            st.warning(f"Could not geocode location: {place}. Error: {err}")
    return results
94
+
95
# Streamlit UI Setup
# Page metadata, title, an "about" expander and the horizontal tab menu.
# The emoji in the title and expander label were mis-encoded (mojibake) and
# are restored here.
st.set_page_config(
    page_title="AI-Powered Historical Document Analysis",
    layout="wide",
    page_icon=":scroll:",
)
st.title("📜 AI-Powered Historical Document Deciphering and Contextualization")

with st.expander("📖 **What is this app about?**"):
    st.write("""
    The **AI-Powered Historical Document Deciphering and Contextualization** app leverages advanced AI to assist
    historians and researchers in analyzing historical documents. It can process handwritten manuscripts, old prints, and maps
    to extract key information, provide contextual insights, and visualize data on modern maps.
    """)

# Compact Navigation
# selected_tab holds the label of the chosen option and drives the tab
# dispatch further down the script.
selected_tab = option_menu(
    menu_title="",
    options=[
        "Home",
        "Key Points",
        "General Contents",
        "Historical Context",
        "Geospatial Visualization",
        "Human-AI Collaboration",
        "Knowledge Graphs",
    ],
    icons=["house", "key", "book", "clock", "globe", "handshake", "share-alt"],
    menu_icon="cast",
    default_index=0,
    orientation="horizontal",
)
115
+
116
# Upload Section
# Accept a PDF or image, extract its text once per upload, then render the
# tab chosen in the navigation menu.
uploaded_file = st.file_uploader(
    "Upload an image or PDF of the historical document",
    type=["pdf", "png", "jpg", "jpeg"],
)

if uploaded_file:
    file_name = uploaded_file.name  # Get the name of the uploaded file
    st.subheader(f"Uploaded File: {file_name}")

    # Streamlit re-runs the whole script on every interaction, so reuse the
    # text already extracted for this upload instead of repeating PDF parsing
    # or OCR each time a widget is touched.
    file_key = (file_name, uploaded_file.size)
    if st.session_state.get("document_key") != file_key:
        # Case-insensitive check so "SCAN.PDF" is not sent down the image path.
        if file_name.lower().endswith(".pdf"):
            extracted_text = extract_text_from_pdf(uploaded_file)
        else:  # Image files
            image = Image.open(uploaded_file)
            extracted_text = extract_text_from_image(image)
        st.session_state["document_text"] = extracted_text
        st.session_state["document_key"] = file_key

    document_text = st.session_state["document_text"]
    st.success("Document uploaded and processed successfully!")

    if selected_tab == "Home":
        st.header("🗎 Document Overview")
        st.write("The uploaded document has been processed. Navigate to the other tabs for detailed analysis.")

    elif selected_tab == "Key Points":
        st.header("🔑 Key Information")
        keywords = extract_keywords(document_text)
        st.write(", ".join(keywords))

    elif selected_tab == "General Contents":
        st.header("📜 General Contents")
        st.text_area("Document Text", value=document_text, height=300, disabled=True)

    elif selected_tab == "Historical Context":
        st.header("🕰 Historical Context")
        with st.spinner("Generating historical context..."):
            context = contextualize_document(document_text)
        st.markdown(context)

    elif selected_tab == "Geospatial Visualization":
        st.header("🌍 Geospatial Data Integration and Visualization")
        with st.spinner("Extracting locations and preparing map..."):
            locations = extract_locations(document_text)
            geocoded_locations = geocode_locations(locations)

        if geocoded_locations:
            m = folium.Map(location=[10.3157, 123.8854], zoom_start=6)  # Default location: Cebu, Philippines
            for loc, lat, lon in geocoded_locations:
                folium.Marker([lat, lon], popup=loc).add_to(m)

            st_folium(m, width=700, height=500)
        else:
            st.warning("No geocoded locations available. Ensure the document contains valid location data.")

    elif selected_tab == "Human-AI Collaboration":
        st.header("🤝 Human-AI Collaboration")
        corrected_text = st.text_area(
            "Edit the extracted text below if there are OCR errors:",
            value=document_text,
            height=300,
        )

        if st.button("Generate Historical Insights"):
            with st.spinner("Analyzing text for insights..."):
                insights = contextualize_document(corrected_text)
            st.markdown(insights)

        if st.button("Generate Alternative Readings"):
            with st.spinner("Generating alternative readings..."):
                # Put the instruction first and call the API directly.
                # Appending it to the text and going through
                # contextualize_document truncated it away for any document
                # longer than 1000 characters and asked for "historical
                # context" instead of alternative readings.
                alternative_readings = call_gemini_api(
                    "Provide alternative readings for the following text:\n\n"
                    f"{corrected_text[:1000]}"
                )
            st.markdown(alternative_readings)

        st.write("### Related Historical Documents")
        st.markdown("""
        - [Historical Archive 1](https://www.example.com/archive1)
        - [Historical Archive 2](https://www.example.com/archive2)
        """)

    elif selected_tab == "Knowledge Graphs":
        st.header("📊 Historical Context Linkage via Knowledge Graphs")
        with st.spinner("Generating knowledge graph..."):
            # NOTE(review): this issues the same "historical context" prompt as
            # the Historical Context tab; no graph structure is requested.
            graph_data = contextualize_document(document_text)
        st.text_area("Knowledge Graph Data", value=graph_data, height=300, disabled=True)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ pillow
4
+ huggingface_hub
5
+ streamlit-option-menu
6
+ pytesseract
7
+ folium
8
+ streamlit-folium
9
+ geopy
10
+ requests