Avanisha committed on
Commit
3d8e4a7
·
verified ·
1 Parent(s): 2e8ee76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -133
app.py CHANGED
@@ -4,16 +4,13 @@ import nltk
4
  import fitz
5
  import random
6
  import base64
7
- import logging
8
  import pycountry
9
  from PIL import Image
10
  import streamlit as st
11
- from fastapi import FastAPI
12
  from langdetect import detect
13
  from config import load_config
14
  from dotenv import load_dotenv
15
  from nltk.corpus import stopwords
16
- from fastapi import FastAPI, Query
17
  from langchain_groq import ChatGroq
18
  from collections import defaultdict
19
  from log_utils import setup_logging
@@ -21,7 +18,6 @@ from nltk.tokenize import sent_tokenize
21
  from nltk.tokenize import word_tokenize
22
  from langchain.chains import RetrievalQA
23
  from upload_pdf import update_or_add_pdf
24
- from fastapi.responses import JSONResponse
25
  from langchain.prompts import ChatPromptTemplate
26
  from langchain_community.vectorstores import Chroma
27
  from sklearn.metrics.pairwise import cosine_similarity
@@ -34,130 +30,7 @@ logger = setup_logging('app')
34
  nltk.download('punkt')
35
  nltk.download('punkt_tab')
36
  nltk.download('stopwords')
37
-
38
- app = FastAPI()
39
-
40
- @app.get("/pdf-details")
41
- async def get_pdf_details(
42
- filename: str = Query(..., description="Filename of the PDF"),
43
- page_number: int = Query(0, description="Page number (0-indexed)")
44
- ):
45
- logger.info(f"Processing PDF details request for file: {filename}, page: {page_number}")
46
- try:
47
- data_path = "/home/bacancy/Documents/Company/Smart PDF Search/data"
48
- file_path = os.path.join(data_path, filename)
49
-
50
- # Open the PDF
51
- logger.debug(f"Opening PDF file: {file_path}")
52
- doc = fitz.open(file_path)
53
-
54
- # Extract full PDF text
55
- full_text = ""
56
- for page in doc:
57
- full_text += page.get_text()
58
-
59
- # Get PDF metadata
60
- pdf_metadata = doc.metadata or {}
61
-
62
- # Extract page text and render page image
63
- page = doc.load_page(page_number)
64
- page_text = page.get_text()
65
-
66
- # Render page as image
67
- pix = page.get_pixmap()
68
- page_image_base64 = base64.b64encode(pix.tobytes("png")).decode('utf-8')
69
-
70
- # Detect language
71
- try:
72
- lang_code = detect(page_text)
73
- language = pycountry.languages.get(alpha_2=lang_code).name
74
- except Exception as e:
75
- logger.warning(f"Language detection failed: {str(e)}")
76
- language = 'Unknown'
77
-
78
- # Prepare response
79
- response = {
80
- "file_path": file_path,
81
- "filename": os.path.basename(file_path),
82
- "total_pages": len(doc),
83
- "current_page": page_number + 1,
84
- "full_text": full_text,
85
- "page_text": page_text,
86
- "page_image": page_image_base64,
87
- "file_size_bytes": os.path.getsize(file_path),
88
- "file_size_kb": f"{os.path.getsize(file_path) / 1024:.2f} KB",
89
- "language": language,
90
- "metadata": {
91
- "title": pdf_metadata.get('title', 'Unknown'),
92
- "author": pdf_metadata.get('author', 'Unknown'),
93
- "creator": pdf_metadata.get('creator', 'Unknown'),
94
- "producer": pdf_metadata.get('producer', 'Unknown')
95
- }
96
- }
97
-
98
- logger.info(f"Successfully processed PDF details for {filename}")
99
- return JSONResponse(content=response)
100
-
101
- except Exception as e:
102
- logger.error(f"Error processing PDF details: {str(e)}", exc_info=True)
103
- return JSONResponse(
104
- content={"error": str(e)},
105
- status_code=500
106
- )
107
-
108
- @app.get("/romanized-text")
109
- async def get_romanized_text(
110
- filename: str = Query(..., description="Filename of the PDF")
111
- ):
112
- logger.info(f"Processing romanized text request for file: {filename}")
113
- try:
114
- data_path = "/home/bacancy/Documents/Company/Smart PDF Search/data"
115
- file_path = os.path.join(data_path, filename)
116
-
117
- # Open the PDF
118
- logger.debug(f"Opening PDF file for romanization: {file_path}")
119
- doc = fitz.open(file_path)
120
-
121
- # Extract full PDF text
122
- full_text = ""
123
- pages_text = []
124
 
125
- for page in doc:
126
- page_text = page.get_text()
127
- full_text += page_text
128
- # Add page info to pages_text list
129
- pages_text.append({
130
- "page_number": page.number + 1, # Adding 1 to make it 1-based instead of 0-based
131
- "text": page_text
132
- })
133
-
134
- # Get PDF metadata
135
- pdf_metadata = doc.metadata or {}
136
-
137
- response = {
138
- "filename": os.path.basename(file_path),
139
- "total_pages": len(doc),
140
- "full_text": full_text,
141
- "pages": pages_text,
142
- "file_size_kb": f"{os.path.getsize(file_path) / 1024:.2f} KB",
143
- "metadata": {
144
- "title": pdf_metadata.get('title', 'Unknown'),
145
- "author": pdf_metadata.get('author', 'Unknown'),
146
- "creator": pdf_metadata.get('creator', 'Unknown'),
147
- "producer": pdf_metadata.get('producer', 'Unknown')
148
- }
149
- }
150
-
151
- logger.info(f"Successfully processed romanized text for {filename}")
152
- return JSONResponse(content=response)
153
-
154
- except Exception as e:
155
- logger.error(f"Error processing romanized text: {str(e)}", exc_info=True)
156
- return JSONResponse(
157
- content={"error": str(e)},
158
- status_code=500
159
- )
160
-
161
  # Load environment variables
162
  load_dotenv()
163
 
@@ -573,7 +446,7 @@ def display_source_documents_with_images(source_documents, query):
573
  highlighted_snippet = highlight_query_words(snippet, query)
574
 
575
  st.markdown(f'<div class="source-content">{highlighted_snippet}</div>', unsafe_allow_html=True)
576
- # st.markdown(f"[View other results in this book](?page=pdf_details&filename={pdf_name}&page_number={page_number})", unsafe_allow_html=True)
577
 
578
  logger.debug(f"Successfully displayed content for {pdf_name}, page {page_number + 1}")
579
 
@@ -628,6 +501,110 @@ def is_query_relevant(question, source_documents, threshold=0.1):
628
  logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
629
  return False
630
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
  def main():
632
  logger.info("Starting Smart PDF Search application")
633
 
@@ -643,16 +620,24 @@ def main():
643
  logger.info(f"Displaying PDF details for {filename}, page {page_number}")
644
 
645
  if filename:
646
- display_pdf_details(filename, page_number)
647
- else:
648
- logger.warning("No filename provided for PDF details")
649
- st.error("No filename provided for PDF details")
 
 
 
650
  elif page == 'romanized_text':
651
  filename = query_params.get('filename', '')
652
  logger.info(f"Displaying romanized text for {filename}")
653
 
654
  if filename:
655
- display_romanized_text_page(filename)
 
 
 
 
 
656
  else:
657
  logger.warning("No filename provided for Romanized text")
658
  st.error("No filename provided for Romanized text")
 
4
  import fitz
5
  import random
6
  import base64
 
7
  import pycountry
8
  from PIL import Image
9
  import streamlit as st
 
10
  from langdetect import detect
11
  from config import load_config
12
  from dotenv import load_dotenv
13
  from nltk.corpus import stopwords
 
14
  from langchain_groq import ChatGroq
15
  from collections import defaultdict
16
  from log_utils import setup_logging
 
18
  from nltk.tokenize import word_tokenize
19
  from langchain.chains import RetrievalQA
20
  from upload_pdf import update_or_add_pdf
 
21
  from langchain.prompts import ChatPromptTemplate
22
  from langchain_community.vectorstores import Chroma
23
  from sklearn.metrics.pairwise import cosine_similarity
 
30
  nltk.download('punkt')
31
  nltk.download('punkt_tab')
32
  nltk.download('stopwords')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Load environment variables
35
  load_dotenv()
36
 
 
446
  highlighted_snippet = highlight_query_words(snippet, query)
447
 
448
  st.markdown(f'<div class="source-content">{highlighted_snippet}</div>', unsafe_allow_html=True)
449
+ st.markdown(f"[View other results in this book](?page=pdf_details&filename={pdf_name}&page_number={page_number})", unsafe_allow_html=True)
450
 
451
  logger.debug(f"Successfully displayed content for {pdf_name}, page {page_number + 1}")
452
 
 
501
  logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
502
  return False
503
 
504
+ def get_pdf_details(filename, page_number):
505
+ """Get details of a specific PDF page."""
506
+ logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
507
+ try:
508
+ data_path = "/home/bacancy/Documents/Company/Smart PDF Search/data"
509
+ file_path = os.path.join(data_path, filename)
510
+
511
+ # Open the PDF
512
+ logger.debug(f"Opening PDF file: {file_path}")
513
+ doc = fitz.open(file_path)
514
+
515
+ # Extract full PDF text
516
+ full_text = ""
517
+ for page in doc:
518
+ full_text += page.get_text()
519
+
520
+ # Get PDF metadata
521
+ pdf_metadata = doc.metadata or {}
522
+
523
+ # Extract page text and render page image
524
+ page = doc.load_page(page_number)
525
+ page_text = page.get_text()
526
+
527
+ # Render page as image
528
+ pix = page.get_pixmap()
529
+ img_bytes = pix.tobytes("png")
530
+ page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
531
+
532
+ # Detect language
533
+ try:
534
+ lang_code = detect(page_text)
535
+ language = pycountry.languages.get(alpha_2=lang_code).name
536
+ except Exception as e:
537
+ logger.warning(f"Language detection failed: {str(e)}")
538
+ language = 'Unknown'
539
+
540
+ # Prepare response
541
+ return {
542
+ "file_path": file_path,
543
+ "filename": os.path.basename(file_path),
544
+ "total_pages": len(doc),
545
+ "current_page": page_number + 1,
546
+ "full_text": full_text,
547
+ "page_text": page_text,
548
+ "page_image": page_image_base64,
549
+ "file_size_bytes": os.path.getsize(file_path),
550
+ "file_size_kb": f"{os.path.getsize(file_path) / 1024:.2f} KB",
551
+ "language": language,
552
+ "metadata": {
553
+ "title": pdf_metadata.get('title', 'Unknown'),
554
+ "author": pdf_metadata.get('author', 'Unknown'),
555
+ "creator": pdf_metadata.get('creator', 'Unknown'),
556
+ "producer": pdf_metadata.get('producer', 'Unknown')
557
+ }
558
+ }
559
+
560
+ except Exception as e:
561
+ logger.error(f"Error processing PDF details: {str(e)}", exc_info=True)
562
+ raise
563
+
564
+ def get_romanized_text(filename):
565
+ """Get romanized text from a PDF."""
566
+ logger.info(f"Processing romanized text for file: {filename}")
567
+ try:
568
+ data_path = "/home/bacancy/Documents/Company/Smart PDF Search/data"
569
+ file_path = os.path.join(data_path, filename)
570
+
571
+ # Open the PDF
572
+ logger.debug(f"Opening PDF file for romanization: {file_path}")
573
+ doc = fitz.open(file_path)
574
+
575
+ # Extract full PDF text
576
+ full_text = ""
577
+ pages_text = []
578
+
579
+ for page in doc:
580
+ page_text = page.get_text()
581
+ full_text += page_text
582
+ pages_text.append({
583
+ "page_number": page.number + 1,
584
+ "text": page_text
585
+ })
586
+
587
+ # Get PDF metadata
588
+ pdf_metadata = doc.metadata or {}
589
+
590
+ return {
591
+ "filename": os.path.basename(file_path),
592
+ "total_pages": len(doc),
593
+ "full_text": full_text,
594
+ "pages": pages_text,
595
+ "file_size_kb": f"{os.path.getsize(file_path) / 1024:.2f} KB",
596
+ "metadata": {
597
+ "title": pdf_metadata.get('title', 'Unknown'),
598
+ "author": pdf_metadata.get('author', 'Unknown'),
599
+ "creator": pdf_metadata.get('creator', 'Unknown'),
600
+ "producer": pdf_metadata.get('producer', 'Unknown')
601
+ }
602
+ }
603
+
604
+ except Exception as e:
605
+ logger.error(f"Error processing romanized text: {str(e)}", exc_info=True)
606
+ raise
607
+
608
  def main():
609
  logger.info("Starting Smart PDF Search application")
610
 
 
620
  logger.info(f"Displaying PDF details for {filename}, page {page_number}")
621
 
622
  if filename:
623
+ try:
624
+ pdf_details = get_pdf_details(filename, page_number)
625
+ display_pdf_details(pdf_details, filename)
626
+ except Exception as e:
627
+ logger.error(f"Error displaying PDF details: {str(e)}")
628
+ st.error(f"Error displaying PDF details: {str(e)}")
629
+
630
  elif page == 'romanized_text':
631
  filename = query_params.get('filename', '')
632
  logger.info(f"Displaying romanized text for {filename}")
633
 
634
  if filename:
635
+ try:
636
+ romanized_data = get_romanized_text(filename)
637
+ display_romanized_text_page(romanized_data)
638
+ except Exception as e:
639
+ logger.error(f"Error displaying romanized text: {str(e)}")
640
+ st.error(f"Error displaying romanized text: {str(e)}")
641
  else:
642
  logger.warning("No filename provided for Romanized text")
643
  st.error("No filename provided for Romanized text")