Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
import json
import os
import random
from datetime import datetime
from io import BytesIO

import fitz  # PyMuPDF
import openai
import streamlit as st
from PIL import Image, ImageDraw, ImageFont
|
9 |
+
|
10 |
+
# Initialize OpenAI API key
|
11 |
+
# Read the key from the environment so the secret never lives in source control.
# SECURITY: a literal key was previously committed on this line; treat it as
# compromised and revoke it in the OpenAI dashboard.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
12 |
+
# Metadata template
|
13 |
+
# Blueprint for the metadata record produced per uploaded document.
# Most fields are lists; "chunks" maps a chunk key to that chunk's tag data.
metadata_template = {
    "catalog_name": "MeData",    # fixed catalog identifier
    "file_name": "",             # filled in per document
    "file_directory": [],
    "file_type": [],
    "page_count": [],            # set to [number_of_pages] after parsing
    "storage_type": ["local"],
    "last_modified": [],         # set to [ISO timestamp] on creation
    "chunks": {},                # chunk key -> {"page_range": [...], <tag>: ...}
}
|
23 |
+
|
24 |
+
# Function to extract text from PDF
|
25 |
+
def extract_text_from_pdf(file):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        file: Binary file-like object holding the PDF; it must provide
            ``read()``. If it also provides ``seek()``, it is rewound first.

    Returns:
        A list with one string per page (``page.get_text()`` output).
    """
    # Rewind in case an earlier read left the cursor at end-of-file, which
    # would otherwise hand PyMuPDF an empty byte string.
    if hasattr(file, "seek"):
        file.seek(0)

    # The context manager closes the document even if text extraction raises.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return [page.get_text() for page in doc]
|
32 |
+
|
33 |
+
# Function to create metadata template
|
34 |
+
def create_metadata_template(file_name):
    """Build a fresh metadata record for one document.

    Args:
        file_name: Name stored under the ``"file_name"`` key.

    Returns:
        A new dict based on ``metadata_template`` with ``"file_name"`` set
        and ``"last_modified"`` set to a one-element list holding the current
        local time in ISO format (the time of this call, not the file's mtime).
    """
    # Deep copy: the template contains nested lists and the "chunks" dict.
    # A shallow copy would share those objects, so chunks written for one
    # document would leak into the template and every later record.
    metadata = copy.deepcopy(metadata_template)
    metadata["file_name"] = file_name
    metadata["last_modified"] = [datetime.now().isoformat()]
    return metadata
|
39 |
+
|
40 |
+
# Function to detect tags using OpenAI API
|
41 |
+
def detect_tags_with_openai(text):
    """Ask the OpenAI chat model to extract tagged key information from text.

    Args:
        text: Page text to analyse. ``None``, empty, or whitespace-only text
            short-circuits to an empty result without calling the API.

    Returns:
        A dict parsed from the model's JSON reply (the prompt asks for each
        key to carry a 'value' and 'evidence'). Returns ``{}`` and shows a
        Streamlit error when the reply is not valid JSON or is not a JSON
        object.
    """
    # Blank pages carry nothing to extract; skip the paid API round-trip.
    if not text or not text.strip():
        return {}

    prompt = (
        "Extract key information as JSON format where each key has a 'value' and 'evidence'. "
        f"Text: {text}"
    )

    # NOTE(review): ChatCompletion is the pre-1.0 openai SDK interface;
    # confirm the pinned openai version before upgrading the dependency.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that extracts structured data."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=500,
        temperature=0,
    )

    response_text = response.choices[0].message['content'].strip()

    # Chat models often wrap JSON in a Markdown code fence (```json ... ```);
    # unwrap it so json.loads sees only the payload.
    if response_text.startswith("```"):
        response_text = response_text.strip("`").strip()
        if response_text.lower().startswith("json"):
            response_text = response_text[4:].strip()

    try:
        extracted_tags = json.loads(response_text)
    except json.JSONDecodeError:
        st.error("Error: Unable to parse JSON response from OpenAI")
        return {}

    # Callers iterate with .items(); a JSON list or scalar would crash them.
    if not isinstance(extracted_tags, dict):
        st.error("Error: OpenAI response was not a JSON object")
        return {}

    return extracted_tags
|
65 |
+
|
66 |
+
# Parse PDF and generate metadata
|
67 |
+
def parse_to_metadata(file, file_name):
    """Parse an uploaded document into a metadata record.

    Only PDFs are parsed. Each page becomes one chunk keyed by its zero-based
    index (as a string) with a one-based ``"page_range"``; tags detected on
    the page are stored on the chunk and their values are also collected in
    top-level lists keyed by tag name.

    Args:
        file: Binary file-like object with the document content.
        file_name: Original file name; its extension selects the parser.

    Returns:
        The metadata dict. For non-PDF names it is the bare template with
        only the fields set by ``create_metadata_template``.
    """
    metadata = create_metadata_template(file_name)

    # Extension match is case-insensitive so "REPORT.PDF" is parsed too.
    if not file_name.lower().endswith(".pdf"):
        return metadata

    pages_content = extract_text_from_pdf(file)
    metadata["page_count"] = [len(pages_content)]

    for i, page_text in enumerate(pages_content):
        chunk = {"page_range": [str(i + 1)]}
        metadata["chunks"][f"{i}"] = chunk

        extracted_tags = detect_tags_with_openai(page_text)
        if not isinstance(extracted_tags, dict):
            continue

        for tag, tag_data in extracted_tags.items():
            chunk[tag] = tag_data

            # The model may return a bare value instead of the requested
            # {"value": ..., "evidence": ...} object; use it as the value.
            if isinstance(tag_data, dict):
                value = tag_data.get("value", "")
            else:
                value = tag_data

            # A tag named like a non-list template field ("file_name",
            # "catalog_name", "chunks") must not be appended to; keep the
            # template field intact and leave the tag on the chunk only.
            collected = metadata.setdefault(tag, [])
            if isinstance(collected, list):
                collected.append(value)

    return metadata
|
87 |
+
|
88 |
+
# Generate random color
|
89 |
+
def random_color():
    """Return a random RGB tuple with every channel in the range 100-255."""
    # The 100 floor keeps the fill light; labels are drawn on it in black.
    return tuple(random.randint(100, 255) for _ in range(3))
|
91 |
+
|
92 |
+
# Generate image with colored tag wrappers
|
93 |
+
def generate_colored_tags_image(metadata):
    """Render every tag value in the metadata as a colored label on an image.

    Args:
        metadata: Metadata dict whose ``"chunks"`` maps chunk keys to dicts of
            ``tag -> tag_data``. Only entries whose ``tag_data`` is a dict
            containing ``"value"`` are drawn; others (for example the
            ``"page_range"`` list) are skipped.

    Returns:
        An 800-pixel-wide RGB PIL image, at least 1000 pixels tall and taller
        when the labels need more room.
    """
    img_width, min_height = 800, 1000
    font = ImageFont.load_default()

    # Pass 1: measure every label on a throwaway canvas so the real canvas
    # can be created at its final size. Growing the image afterwards with
    # Image.resize would rescale (stretch) the labels already drawn.
    scratch = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    labels = []  # (text, fill color, y, text width, text height)
    y_position = 20

    for chunk in metadata["chunks"].values():
        for tag, tag_data in chunk.items():
            if not isinstance(tag_data, dict) or "value" not in tag_data:
                continue

            text = f"{tag}: {tag_data['value']}"
            left, top, right, bottom = scratch.textbbox((20, y_position), text, font=font)
            text_width = right - left
            text_height = bottom - top

            labels.append((text, random_color(), y_position, text_width, text_height))
            y_position += text_height + 20

    # Pass 2: draw onto a canvas tall enough for all labels plus a margin.
    img_height = max(min_height, y_position + 40)
    img = Image.new("RGB", (img_width, img_height), "white")
    draw = ImageDraw.Draw(img)

    for text, color, y, text_width, text_height in labels:
        draw.rectangle(
            [(20, y), (20 + text_width + 10, y + text_height + 10)],
            fill=color,
        )
        draw.text((25, y + 5), text, fill="black", font=font)

    return img
|
125 |
+
|
126 |
+
# Streamlit UI
|
127 |
+
st.title("Metadata Generator Tool")

uploaded_file = st.file_uploader("Upload a PDF or Image", type=["pdf", "jpg", "png"])

if uploaded_file:
    file_name = uploaded_file.name
    file_type = file_name.split(".")[-1]

    st.write(f"**File Name:** {file_name}")
    st.write(f"**File Type:** {file_type}")

    if file_type == "pdf":
        # Streamlit re-executes this script on every widget interaction,
        # including clicks on the download buttons below. Cache the parsed
        # result per upload so each rerun does not re-send every page to the
        # OpenAI API or re-roll the label colors.
        # NOTE(review): file_id is only present on newer Streamlit releases;
        # name and size serve as the fallback identity.
        cache_key = (getattr(uploaded_file, "file_id", None), file_name, uploaded_file.size)
        cached = st.session_state.get("metadata_cache")
        if cached is None or cached["key"] != cache_key:
            metadata = parse_to_metadata(uploaded_file, file_name)
            cached = {
                "key": cache_key,
                "metadata": metadata,
                "tag_image": generate_colored_tags_image(metadata),
            }
            st.session_state["metadata_cache"] = cached

        metadata = cached["metadata"]
        tag_image = cached["tag_image"]

        # Save metadata as JSON
        output_json = json.dumps(metadata, indent=4)
        st.download_button(
            label="Download Metadata as JSON",
            data=output_json,
            file_name="metadata.json",
            mime="application/json",
        )

        # Display metadata
        st.subheader("Extracted Metadata:")
        st.json(metadata)

        # Generate and display image with tags
        st.subheader("Visualized Tags:")
        img_bytes = BytesIO()
        tag_image.save(img_bytes, format="PNG")
        st.image(tag_image, caption="Tags Visualization")

        # Download tag image
        st.download_button(
            label="Download Tag Visualization Image",
            data=img_bytes.getvalue(),
            file_name="tag_visualization.png",
            mime="image/png",
        )
    else:
        st.error("Currently, only PDF files are supported for metadata generation.")
|