DrishtiSharma committed on
Commit
9203c63
·
verified ·
1 Parent(s): 2c2a658

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -3
app.py CHANGED
@@ -50,11 +50,11 @@ if "vector_store" not in st.session_state:
50
 
51
  # ----------------- Metadata Extraction -----------------
52
  def extract_metadata_llm(pdf_path):
53
- """Extracts metadata using LLM instead of regex."""
54
  with pdfplumber.open(pdf_path) as pdf:
55
  first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
56
 
57
- #Define metadata_prompt
58
  metadata_prompt = PromptTemplate(
59
  input_variables=["text"],
60
  template="""
@@ -81,7 +81,7 @@ def extract_metadata_llm(pdf_path):
81
  """
82
  )
83
 
84
- # ✅ Now metadata_prompt exists when used in LLMChain
85
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
86
  metadata_response = metadata_chain.invoke({"text": first_page_text})
87
 
@@ -91,6 +91,11 @@ def extract_metadata_llm(pdf_path):
91
 
92
  try:
93
  metadata_dict = json.loads(json_text)
 
 
 
 
 
94
  except json.JSONDecodeError:
95
  metadata_dict = {
96
  "Title": "Unknown",
 
50
 
51
  # ----------------- Metadata Extraction -----------------
52
  def extract_metadata_llm(pdf_path):
53
+ """Extracts metadata using LLM"""
54
  with pdfplumber.open(pdf_path) as pdf:
55
  first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
56
 
57
+ # Define metadata_prompt
58
  metadata_prompt = PromptTemplate(
59
  input_variables=["text"],
60
  template="""
 
81
  """
82
  )
83
 
84
+
85
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
86
  metadata_response = metadata_chain.invoke({"text": first_page_text})
87
 
 
91
 
92
  try:
93
  metadata_dict = json.loads(json_text)
94
+ # Ensure all expected fields are present
95
+ required_fields = ["Title", "Author", "Emails", "Affiliations"]
96
+ for field in required_fields:
97
+ if field not in metadata_dict:
98
+ metadata_dict[field] = "Unknown"
99
  except json.JSONDecodeError:
100
  metadata_dict = {
101
  "Title": "Unknown",