subramanyamrekhandar committed on
Commit
b62b8ee
·
verified ·
1 Parent(s): 9a376e1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -0
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import copy
import json
import os
import random
from datetime import datetime
from io import BytesIO

import fitz  # PyMuPDF
import openai
import streamlit as st
from PIL import Image, ImageDraw, ImageFont
9
+
10
# Initialize the OpenAI API key from the environment instead of hard-coding it.
# Secrets must never be committed to source control; any key that was ever
# committed in this file should be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
12
+ # Metadata template
13
# Baseline metadata record. Per-file records start from these keys; the list
# and dict values are placeholders that get filled in during parsing.
metadata_template = dict(
    catalog_name="MeData",
    file_name="",
    file_directory=[],
    file_type=[],
    page_count=[],
    storage_type=["local"],
    last_modified=[],
    chunks={},
)
23
+
24
+ # Function to extract text from PDF
25
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF.

    Args:
        file: A binary file-like object whose ``read()`` returns the PDF bytes.

    Returns:
        A list of strings, one per page, in page order.
    """
    # Open via a context manager so the document is closed even when text
    # extraction raises; previously the document was never closed.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return [page.get_text() for page in doc]
32
+
33
+ # Function to create metadata template
34
def create_metadata_template(file_name):
    """Build a fresh metadata record for one file.

    Args:
        file_name: Name of the file; stored under the ``"file_name"`` key.

    Returns:
        A new dict based on ``metadata_template`` with ``"file_name"`` set and
        ``"last_modified"`` set to a one-element list holding the current
        local time as an ISO 8601 string.
    """
    # A deep copy is required: a shallow ``dict.copy()`` shares the nested
    # "chunks" dict and the list values with the module-level template, so
    # filling in one record would silently mutate the template itself.
    metadata = copy.deepcopy(metadata_template)
    metadata["file_name"] = file_name
    metadata["last_modified"] = [datetime.now().isoformat()]
    return metadata
39
+
40
+ # Function to detect tags using OpenAI API
41
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if present."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line (it may carry a language tag such as
    # "json") and the closing fence line when there is one.
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def detect_tags_with_openai(text):
    """Ask the OpenAI chat API to extract key information from *text*.

    Args:
        text: The text to analyse (one PDF page in this app).

    Returns:
        The JSON object parsed from the model's reply, as a dict. An empty
        dict is returned when *text* is empty or whitespace-only (no API
        call is made), when the reply is not valid JSON, or when the reply
        is valid JSON but not an object. The two failure cases are also
        reported through ``st.error``.
    """
    # Nothing to extract from a blank page; skip the network round trip.
    if not text or not text.strip():
        return {}

    prompt = (
        "Extract key information as JSON format where each key has a 'value' and 'evidence'. "
        f"Text: {text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that extracts structured data."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        temperature=0
    )

    response_text = response.choices[0].message['content'].strip()
    try:
        # Models frequently wrap JSON in a Markdown fence; unwrap it first.
        extracted_tags = json.loads(_strip_code_fence(response_text))
    except json.JSONDecodeError:
        st.error("Error: Unable to parse JSON response from OpenAI")
        return {}

    # Callers iterate ``.items()``, so anything but a JSON object is unusable.
    if not isinstance(extracted_tags, dict):
        st.error("Error: OpenAI response was not a JSON object")
        return {}

    return extracted_tags
65
+
66
+ # Parse PDF and generate metadata
67
def parse_to_metadata(file, file_name):
    """Parse an uploaded file into a metadata record.

    For a PDF, each page becomes one entry in ``metadata["chunks"]`` (keyed by
    the zero-based page index as a string) holding its one-based
    ``"page_range"`` and the tags detected on that page. Each tag's value is
    also appended to a top-level list named after the tag.

    Args:
        file: Binary file-like object with the file contents.
        file_name: Name of the file; its extension decides whether it is
            parsed as a PDF.

    Returns:
        The metadata dict. For non-PDF names it is returned without chunks.
    """
    metadata = create_metadata_template(file_name)

    # Compare case-insensitively so "REPORT.PDF" is treated as a PDF too.
    if not file_name.lower().endswith(".pdf"):
        return metadata

    pages_content = extract_text_from_pdf(file)
    metadata["page_count"] = [len(pages_content)]

    for i, page_text in enumerate(pages_content):
        chunk = {"page_range": [str(i + 1)]}
        metadata["chunks"][f"{i}"] = chunk

        for tag, tag_data in detect_tags_with_openai(page_text).items():
            chunk[tag] = tag_data

            # The model does not always honour the requested
            # {"value": ..., "evidence": ...} shape; fall back to the raw
            # payload instead of crashing on ``.get``.
            if isinstance(tag_data, dict):
                value = tag_data.get("value", "")
            else:
                value = tag_data

            # A tag may share its name with a template key that is not a
            # list (e.g. "file_name" or "chunks"); leave such keys untouched
            # rather than calling ``.append`` on a str or dict.
            bucket = metadata.setdefault(tag, [])
            if isinstance(bucket, list):
                bucket.append(value)

    return metadata
87
+
88
+ # Generate random color
89
def random_color(low=100, high=255):
    """Return a random RGB colour as an ``(r, g, b)`` tuple of ints.

    Args:
        low: Smallest allowed channel value, inclusive. The default of 100
            presumably keeps fills light enough for dark text drawn on top.
        high: Largest allowed channel value, inclusive.

    Returns:
        A 3-tuple whose components are each drawn uniformly from
        ``[low, high]``.
    """
    return tuple(random.randint(low, high) for _ in range(3))
91
+
92
+ # Generate image with colored tag wrappers
93
def generate_colored_tags_image(metadata):
    """Render every tag in ``metadata["chunks"]`` as a coloured label.

    Each chunk entry that is a dict containing a ``"value"`` key is drawn as
    ``"<tag>: <value>"`` in black text on a randomly coloured rectangle,
    stacked top to bottom.

    Args:
        metadata: Metadata dict with a ``"chunks"`` mapping of chunk dicts.

    Returns:
        A PIL RGB image 800 pixels wide and at least 1000 pixels tall; it is
        taller when the labels need more room.
    """
    img_width, min_height = 800, 1000
    font = ImageFont.load_default()

    # Pass 1: measure every label so the canvas can be sized up front.
    # Growing the canvas with ``Image.resize`` (as was done before) rescales
    # the pixels already drawn, which stretches earlier labels and makes
    # later ones overlap them.
    measurer = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    rows = []
    y_position = 20
    for chunk in metadata["chunks"].values():
        for tag, tag_data in chunk.items():
            # Entries such as "page_range" (a list) or malformed tag
            # payloads (str, int, ...) carry no "value" to display.
            if not isinstance(tag_data, dict) or "value" not in tag_data:
                continue
            value = tag_data["value"]
            text = f"{tag}: {value}"

            text_bbox = measurer.textbbox((20, y_position), text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]

            rows.append((y_position, text, text_width, text_height))
            y_position += text_height + 20

    # Pass 2: draw once on a canvas that leaves a 40 px bottom margin.
    img_height = max(min_height, y_position + 40)
    img = Image.new("RGB", (img_width, img_height), "white")
    draw = ImageDraw.Draw(img)
    for row_y, text, text_width, text_height in rows:
        draw.rectangle(
            [(20, row_y), (20 + text_width + 10, row_y + text_height + 10)],
            fill=random_color()
        )
        draw.text((25, row_y + 5), text, fill="black", font=font)

    return img
125
+
126
+ # Streamlit UI
127
st.title("Metadata Generator Tool")

uploaded_file = st.file_uploader("Upload a PDF or Image", type=["pdf", "jpg", "png"])

if uploaded_file:
    file_name = uploaded_file.name
    file_type = file_name.split(".")[-1]

    st.write(f"**File Name:** {file_name}")
    st.write(f"**File Type:** {file_type}")

    if file_type == "pdf":
        # Streamlit re-executes this script on every interaction, including a
        # click on either download button. Cache the results per upload so a
        # rerun does not repeat the per-page OpenAI calls or re-randomise the
        # tag colours.
        cache_key = (file_name, uploaded_file.size)
        if st.session_state.get("metadata_cache_key") != cache_key:
            with st.spinner("Extracting metadata..."):
                metadata = parse_to_metadata(uploaded_file, file_name)
                tag_image = generate_colored_tags_image(metadata)
                img_bytes = BytesIO()
                tag_image.save(img_bytes, format="PNG")
            st.session_state["metadata"] = metadata
            st.session_state["tag_image_png"] = img_bytes.getvalue()
            st.session_state["metadata_cache_key"] = cache_key

        metadata = st.session_state["metadata"]
        tag_image_png = st.session_state["tag_image_png"]

        # Save metadata as JSON
        output_json = json.dumps(metadata, indent=4)
        st.download_button(
            label="Download Metadata as JSON",
            data=output_json,
            file_name="metadata.json",
            mime="application/json"
        )

        # Display metadata
        st.subheader("Extracted Metadata:")
        st.json(metadata)

        # Display image with tags
        st.subheader("Visualized Tags:")
        st.image(tag_image_png, caption="Tags Visualization")

        # Download tag image
        st.download_button(
            label="Download Tag Visualization Image",
            data=tag_image_png,
            file_name="tag_visualization.png",
            mime="image/png"
        )
    else:
        st.error("Currently, only PDF files are supported for metadata generation.")