Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,7 @@ import fitz
|
|
12 |
import PyPDF2
|
13 |
import gradio
|
14 |
import sys
|
|
|
15 |
from pathlib import Path
|
16 |
utils_dir = Path(__file__).parent / 'utils'
|
17 |
sys.path.append(str(utils_dir))
|
@@ -19,9 +20,13 @@ from openai_utils import *
|
|
19 |
import base64
|
20 |
from pdf2image import convert_from_bytes
|
21 |
import requests
|
|
|
|
|
|
|
|
|
22 |
PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
|
23 |
PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
|
24 |
-
|
25 |
|
26 |
def insert_sentence(text, sentence, interval):
|
27 |
lines = text.split('\n')
|
@@ -44,7 +49,18 @@ def insert_sentence(text, sentence, interval):
|
|
44 |
new_lines.append(separator.join(new_words))
|
45 |
|
46 |
return '\n'.join(new_lines)
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
def search_paper(query):
|
49 |
SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
|
50 |
url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"
|
@@ -57,10 +73,21 @@ def search_paper(query):
|
|
57 |
|
58 |
return response.json()
|
59 |
|
60 |
-
def
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
def download_pdf(paper):
|
66 |
pdf_url = paper["openAccessPdf"]["url"]
|
@@ -70,8 +97,7 @@ def download_pdf(paper):
|
|
70 |
|
71 |
|
72 |
file_object = BytesIO(response.content)
|
73 |
-
|
74 |
-
chunks = split_text_into_chunks(extract_text)
|
75 |
return chunks
|
76 |
except:
|
77 |
return []
|
@@ -79,7 +105,7 @@ def download_pdf(paper):
|
|
79 |
|
80 |
def recommendation(s2_id, limit=500):
|
81 |
SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
|
82 |
-
url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"
|
83 |
|
84 |
# print(url)
|
85 |
response = requests.get(url)
|
@@ -92,22 +118,20 @@ def recommendation(s2_id, limit=500):
|
|
92 |
|
93 |
|
94 |
def extract_chapter(file_object):
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
break
|
110 |
-
return extracted_text
|
111 |
|
112 |
|
113 |
|
@@ -138,7 +162,8 @@ class Reviewer:
|
|
138 |
for paper in papers:
|
139 |
retrieval_content += f"Relevant Paper {str(cnt)}:\n"
|
140 |
retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
|
141 |
-
|
|
|
142 |
cnt += 1
|
143 |
text = retrieval_content + content
|
144 |
chat_review_text = self.chat_review(text=text)
|
@@ -215,8 +240,8 @@ class Reviewer:
|
|
215 |
return rec_papers
|
216 |
|
217 |
def extract_related_content(self, papers, aspect):
|
218 |
-
os.environ["OPENAI_BASE_URL"] =
|
219 |
-
os.environ["OPENAI_API_KEY"] =
|
220 |
client = AsyncOpenAI()
|
221 |
|
222 |
messages = []
|
@@ -248,7 +273,7 @@ class Reviewer:
|
|
248 |
)
|
249 |
)
|
250 |
|
251 |
-
paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]
|
252 |
|
253 |
for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
|
254 |
if response.strip().lower().startswith("yes"):
|
@@ -314,7 +339,7 @@ Organize the result in JSON format as follows:
|
|
314 |
for paper_data, response in zip(paper_data_list, responses):
|
315 |
# print(response)
|
316 |
response = json.loads(response)
|
317 |
-
results.append({"title": paper_data["title"], "content": response["revised_text"]})
|
318 |
return results
|
319 |
|
320 |
|
@@ -372,7 +397,7 @@ Organize the result in JSON format as follows:
|
|
372 |
result = ""
|
373 |
limit_cnt = 1
|
374 |
for limitation in limitations:
|
375 |
-
result += f"{str(limit_cnt)}. {limitation}\n"
|
376 |
limit_cnt += 1
|
377 |
# for choice in response.choices:
|
378 |
# result += choice.message.content
|
@@ -390,7 +415,7 @@ Organize the result in JSON format as follows:
|
|
390 |
query = title
|
391 |
search_results = search_paper(query)
|
392 |
if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
|
393 |
-
search_result = search_results[0]
|
394 |
retrieval = recommendation(search_result["paperId"])
|
395 |
recommended_paper_list = []
|
396 |
for recommended_paper in retrieval["recommendedPapers"]:
|
@@ -443,7 +468,7 @@ Organize the result in JSON format as follows:
|
|
443 |
file_object = BytesIO(pdf_path) # TODO
|
444 |
pdf_reader = PyPDF2.PdfReader(file_object)
|
445 |
|
446 |
-
doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO
|
447 |
page = doc.load_page(0)
|
448 |
pix = page.get_pixmap()
|
449 |
image_bytes = pix.tobytes("png")
|
@@ -470,21 +495,21 @@ Organize the result in JSON format as follows:
|
|
470 |
title = response["title"]
|
471 |
abstract = response["abstract"]
|
472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
473 |
|
|
|
|
|
|
|
|
|
474 |
|
475 |
-
num_pages = len(pdf_reader.pages)
|
476 |
-
extraction_started = False
|
477 |
-
extracted_text = ""
|
478 |
-
for page_number in range(num_pages):
|
479 |
-
page = pdf_reader.pages[page_number]
|
480 |
-
page_text = page.extract_text()
|
481 |
-
|
482 |
-
extraction_started = True
|
483 |
-
page_number_start = page_number
|
484 |
-
if extraction_started:
|
485 |
-
extracted_text += page_text
|
486 |
-
if page_number_start + 1 < page_number:
|
487 |
-
break
|
488 |
return extracted_text, title, abstract
|
489 |
|
490 |
def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
|
@@ -511,9 +536,6 @@ def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
|
|
511 |
return retrieved_content, comments, output2
|
512 |
|
513 |
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
########################################################################################################
|
518 |
|
519 |
title = "LimitGen"
|
|
|
12 |
import PyPDF2
|
13 |
import gradio
|
14 |
import sys
|
15 |
+
from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk, OCRResponse
|
16 |
from pathlib import Path
|
17 |
utils_dir = Path(__file__).parent / 'utils'
|
18 |
sys.path.append(str(utils_dir))
|
|
|
20 |
import base64
|
21 |
from pdf2image import convert_from_bytes
|
22 |
import requests
|
23 |
+
import bibtexparser
|
24 |
+
from pybtex.database import parse_string
|
25 |
+
from pybtex.plugin import find_plugin
|
26 |
+
|
27 |
PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
|
28 |
PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
|
29 |
+
MISTRAL_API = os.getenv('MISTRAL_API')
|
30 |
|
31 |
def insert_sentence(text, sentence, interval):
|
32 |
lines = text.split('\n')
|
|
|
49 |
new_lines.append(separator.join(new_words))
|
50 |
|
51 |
return '\n'.join(new_lines)
|
52 |
+
|
53 |
+
|
54 |
+
def format_bibtex(paper, style='apa'):
    """Render a paper's BibTeX record as a plain-text citation.

    Args:
        paper: Mapping that is expected to carry the BibTeX source under
            ``paper["citationStyles"]["bibtex"]``.
        style: Name of a ``pybtex.style.formatting`` plugin used to format
            the entries.

    Returns:
        The formatted entries rendered as text and joined with newlines, or
        ``"No valid entries found."`` when the paper has no BibTeX string or
        the string parses to zero entries.
    """
    # NOTE(review): 'apa' does not appear to be one of the styles bundled with
    # pybtex (plain, unsrt, alpha, unsrtalpha) -- confirm a plugin providing it
    # is installed, or that every caller passes an explicit style.

    # Defensive lookup: a missing / None ``citationStyles`` or an empty BibTeX
    # string falls through to the same message as an empty parse result
    # instead of raising KeyError / TypeError.
    citation_styles = paper.get("citationStyles") or {}
    bibtex_entry = citation_styles.get("bibtex")
    if not bibtex_entry:
        return "No valid entries found."

    bib_data = parse_string(bibtex_entry, 'bibtex')
    entries = list(bib_data.entries.values())
    if not entries:
        return "No valid entries found."

    formatter = find_plugin('pybtex.style.formatting', style)()
    formatted_entries = formatter.format_entries(entries)
    return '\n'.join(entry.text.render_as('text') for entry in formatted_entries)
|
63 |
+
|
64 |
def search_paper(query):
|
65 |
SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
|
66 |
url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"
|
|
|
73 |
|
74 |
return response.json()
|
75 |
|
76 |
+
def get_combined_markdown(pdf_response: "OCRResponse") -> str:
    """Concatenate the markdown of every page in an OCR response.

    Args:
        pdf_response: OCR result exposing a ``pages`` iterable whose items
            each have a ``markdown`` attribute.

    Returns:
        All page markdown strings joined by a blank line, in page order.
        An empty ``pages`` collection yields ``""``.
    """
    return "\n\n".join(page.markdown for page in pdf_response.pages)
|
82 |
+
|
83 |
+
def split_text_into_chunks(pdf_response: "OCRResponse") -> list[str]:
    """Return the OCR result as one markdown chunk per page.

    Args:
        pdf_response: OCR result exposing a ``pages`` iterable whose items
            each have a ``markdown`` attribute.

    Returns:
        A list holding each page's markdown string, in page order. An empty
        ``pages`` collection yields ``[]``.
    """
    # Chunk boundaries are the page boundaries of the OCR response.
    return [page.markdown for page in pdf_response.pages]
|
91 |
|
92 |
def download_pdf(paper):
|
93 |
pdf_url = paper["openAccessPdf"]["url"]
|
|
|
97 |
|
98 |
|
99 |
file_object = BytesIO(response.content)
|
100 |
+
chunks = extract_chapter(file_object)
|
|
|
101 |
return chunks
|
102 |
except:
|
103 |
return []
|
|
|
105 |
|
106 |
def recommendation(s2_id, limit=500):
|
107 |
SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
|
108 |
+
url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf,citationStyles"
|
109 |
|
110 |
# print(url)
|
111 |
response = requests.get(url)
|
|
|
118 |
|
119 |
|
120 |
def extract_chapter(file_object):
    """Run Mistral OCR on a PDF stream and return its per-page markdown.

    The PDF bytes are uploaded to Mistral with purpose ``"ocr"``, a signed
    URL is requested for the uploaded file, and that URL is handed to the
    ``mistral-ocr-latest`` model.

    Args:
        file_object: Binary file-like object; its remaining content is read
            with ``read()`` and uploaded under the name ``retrieve.pdf``.

    Returns:
        The value of ``split_text_into_chunks`` for the OCR response, i.e.
        one markdown string per page.
    """
    client = Mistral(api_key=MISTRAL_API)
    uploaded_file = client.files.upload(
        file={
            "file_name": "retrieve.pdf",
            "content": file_object.read(),
        },
        purpose="ocr",
    )

    # NOTE(review): expiry is presumably expressed in hours -- confirm
    # against the Mistral SDK.
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    # Only ``page.markdown`` is consumed downstream, so the base64 image
    # payloads are not requested; this keeps the response small.
    pdf_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=False,
    )
    return split_text_into_chunks(pdf_response)
|
|
|
|
|
135 |
|
136 |
|
137 |
|
|
|
162 |
for paper in papers:
|
163 |
retrieval_content += f"Relevant Paper {str(cnt)}:\n"
|
164 |
retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
|
165 |
+
formatted_citation = format_bibtex(paper, 'unsrt')
|
166 |
+
retrieved_papers += f"{str(cnt)}. {formatted_citation} ({paper['url']})\n\n"
|
167 |
cnt += 1
|
168 |
text = retrieval_content + content
|
169 |
chat_review_text = self.chat_review(text=text)
|
|
|
240 |
return rec_papers
|
241 |
|
242 |
def extract_related_content(self, papers, aspect):
|
243 |
+
os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
|
244 |
+
os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
|
245 |
client = AsyncOpenAI()
|
246 |
|
247 |
messages = []
|
|
|
273 |
)
|
274 |
)
|
275 |
|
276 |
+
paper_data_list = [{"title": paper["title"], "content": "", "citationStyles": paper["citationStyles"], "url": paper["url"]} for paper in papers]
|
277 |
|
278 |
for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
|
279 |
if response.strip().lower().startswith("yes"):
|
|
|
339 |
for paper_data, response in zip(paper_data_list, responses):
|
340 |
# print(response)
|
341 |
response = json.loads(response)
|
342 |
+
results.append({"title": paper_data["title"], "content": response["revised_text"], "citationStyles": paper_data["citationStyles"], "url": paper_data["url"]})
|
343 |
return results
|
344 |
|
345 |
|
|
|
397 |
result = ""
|
398 |
limit_cnt = 1
|
399 |
for limitation in limitations:
|
400 |
+
result += f"{str(limit_cnt)}. {limitation}\n\n"
|
401 |
limit_cnt += 1
|
402 |
# for choice in response.choices:
|
403 |
# result += choice.message.content
|
|
|
415 |
query = title
|
416 |
search_results = search_paper(query)
|
417 |
if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
|
418 |
+
search_result = search_results["data"][0]
|
419 |
retrieval = recommendation(search_result["paperId"])
|
420 |
recommended_paper_list = []
|
421 |
for recommended_paper in retrieval["recommendedPapers"]:
|
|
|
468 |
file_object = BytesIO(pdf_path) # TODO
|
469 |
pdf_reader = PyPDF2.PdfReader(file_object)
|
470 |
|
471 |
+
doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO path/bytes
|
472 |
page = doc.load_page(0)
|
473 |
pix = page.get_pixmap()
|
474 |
image_bytes = pix.tobytes("png")
|
|
|
495 |
title = response["title"]
|
496 |
abstract = response["abstract"]
|
497 |
|
498 |
+
client = Mistral(api_key=MISTRAL_API)
|
499 |
+
file_object.seek(0)
|
500 |
+
uploaded_file = client.files.upload(
|
501 |
+
file={
|
502 |
+
"file_name": "upload.pdf",
|
503 |
+
"content": file_object.read(),
|
504 |
+
},
|
505 |
+
purpose="ocr",
|
506 |
+
)
|
507 |
|
508 |
+
signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
|
509 |
+
pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url), model="mistral-ocr-latest", include_image_base64=True)
|
510 |
+
# response_dict = json.loads(pdf_response.json())
|
511 |
+
extracted_text = get_combined_markdown(pdf_response)
|
512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
513 |
return extracted_text, title, abstract
|
514 |
|
515 |
def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
|
|
|
536 |
return retrieved_content, comments, output2
|
537 |
|
538 |
|
|
|
|
|
|
|
539 |
########################################################################################################
|
540 |
|
541 |
title = "LimitGen"
|