zjXu11 committed on
Commit 0971eef · verified · 1 Parent(s): 0367a28

Update app.py

Files changed (1)
  1. app.py +71 -49
app.py CHANGED
@@ -12,6 +12,7 @@ import fitz
 import PyPDF2
 import gradio
 import sys
+from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk, OCRResponse
 from pathlib import Path
 utils_dir = Path(__file__).parent / 'utils'
 sys.path.append(str(utils_dir))
@@ -19,9 +20,13 @@ from openai_utils import *
 import base64
 from pdf2image import convert_from_bytes
 import requests
+import bibtexparser
+from pybtex.database import parse_string
+from pybtex.plugin import find_plugin
+
 PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
 PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
-
+MISTRAL_API = os.getenv('MISTRAL_API')
 
 def insert_sentence(text, sentence, interval):
     lines = text.split('\n')
@@ -44,7 +49,18 @@ def insert_sentence(text, sentence, interval):
         new_lines.append(separator.join(new_words))
 
     return '\n'.join(new_lines)
-
+
+
+def format_bibtex(paper, style='apa'):
+    bibtex_entry = paper["citationStyles"]["bibtex"]
+    bib_data = parse_string(bibtex_entry, 'bibtex')
+    formatter = find_plugin('pybtex.style.formatting', style)()
+    entries = list(bib_data.entries.values())
+    if not entries:
+        return "No valid entries found."
+    formatted_entry = formatter.format_entries(entries)
+    return '\n'.join(entry.text.render_as('text') for entry in formatted_entry)
+
 def search_paper(query):
     SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
     url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"
@@ -57,10 +73,21 @@ def search_paper(query):
 
     return response.json()
 
-def split_text_into_chunks(text, chunk_size=300):
-    words = text.split()
-    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
-    return chunks
+def get_combined_markdown(pdf_response: OCRResponse) -> str:
+    markdowns: list[str] = []
+    for page in pdf_response.pages:
+        markdowns.append(page.markdown)
+
+    return "\n\n".join(markdowns)
+
+def split_text_into_chunks(pdf_response: OCRResponse) -> str:
+    # words = text.split()
+    # chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    # return chunks
+    markdowns: list[str] = []
+    for page in pdf_response.pages:
+        markdowns.append(page.markdown)
+    return markdowns
 
 def download_pdf(paper):
     pdf_url = paper["openAccessPdf"]["url"]
@@ -70,8 +97,7 @@ def download_pdf(paper):
 
 
         file_object = BytesIO(response.content)
-        extract_text = extract_chapter(file_object)
-        chunks = split_text_into_chunks(extract_text)
+        chunks = extract_chapter(file_object)
         return chunks
     except:
         return []
@@ -79,7 +105,7 @@ def download_pdf(paper):
 
 def recommendation(s2_id, limit=500):
     SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
-    url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"
+    url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf,citationStyles"
 
     # print(url)
     response = requests.get(url)
@@ -92,22 +118,20 @@ def recommendation(s2_id, limit=500):
 
 
 def extract_chapter(file_object):
-    pdf_reader = PyPDF2.PdfReader(file_object)
-
-
-    num_pages = len(pdf_reader.pages)
-    extraction_started = False
-    extracted_text = ""
-    for page_number in range(num_pages):
-        page = pdf_reader.pages[page_number]
-        page_text = page.extract_text()
-        extraction_started = True
-        page_number_start = page_number
-        if extraction_started:
-            extracted_text += page_text
-        if page_number_start + 1 < page_number:
-            break
-    return extracted_text
+    client = Mistral(api_key=MISTRAL_API)
+    uploaded_file = client.files.upload(
+        file={
+            "file_name": "retrieve.pdf",
+            "content": file_object.read(),
+        },
+        purpose="ocr",
+    )
+
+    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
+    pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url), model="mistral-ocr-latest", include_image_base64=True)
+    # response_dict = json.loads(pdf_response.json())
+    chunks = split_text_into_chunks(pdf_response)
+    return chunks
 
 
 
@@ -138,7 +162,8 @@ class Reviewer:
         for paper in papers:
             retrieval_content += f"Relevant Paper {str(cnt)}:\n"
             retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
-            retrieved_papers += f"{str(cnt)}. {paper['title']}\n"
+            formatted_citation = format_bibtex(paper, 'unsrt')
+            retrieved_papers += f"{str(cnt)}. {formatted_citation} ({paper['url']})\n\n"
             cnt += 1
         text = retrieval_content + content
         chat_review_text = self.chat_review(text=text)
@@ -215,8 +240,8 @@ class Reviewer:
         return rec_papers
 
     def extract_related_content(self, papers, aspect):
-        os.environ["OPENAI_BASE_URL"] = self.api_base
-        os.environ["OPENAI_API_KEY"] = self.api
+        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
+        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
         client = AsyncOpenAI()
 
         messages = []
@@ -248,7 +273,7 @@ class Reviewer:
            )
        )
 
-        paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]
+        paper_data_list = [{"title": paper["title"], "content": "", "citationStyles": paper["citationStyles"], "url": paper["url"]} for paper in papers]
 
         for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
            if response.strip().lower().startswith("yes"):
@@ -314,7 +339,7 @@ Organize the result in JSON format as follows:
         for paper_data, response in zip(paper_data_list, responses):
             # print(response)
             response = json.loads(response)
-            results.append({"title": paper_data["title"], "content": response["revised_text"]})
+            results.append({"title": paper_data["title"], "content": response["revised_text"], "citationStyles": paper_data["citationStyles"], "url": paper_data["url"]})
         return results
 
 
@@ -372,7 +397,7 @@ Organize the result in JSON format as follows:
         result = ""
         limit_cnt = 1
         for limitation in limitations:
-            result += f"{str(limit_cnt)}. {limitation}\n"
+            result += f"{str(limit_cnt)}. {limitation}\n\n"
             limit_cnt += 1
         # for choice in response.choices:
         #     result += choice.message.content
@@ -390,7 +415,7 @@ Organize the result in JSON format as follows:
         query = title
         search_results = search_paper(query)
         if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
-            search_result = search_results[0]
+            search_result = search_results["data"][0]
             retrieval = recommendation(search_result["paperId"])
             recommended_paper_list = []
             for recommended_paper in retrieval["recommendedPapers"]:
@@ -443,7 +468,7 @@ Organize the result in JSON format as follows:
     file_object = BytesIO(pdf_path) # TODO
     pdf_reader = PyPDF2.PdfReader(file_object)
 
-    doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO
+    doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO path/bytes
     page = doc.load_page(0)
     pix = page.get_pixmap()
     image_bytes = pix.tobytes("png")
@@ -470,21 +495,21 @@ Organize the result in JSON format as follows:
     title = response["title"]
     abstract = response["abstract"]
 
+    client = Mistral(api_key=MISTRAL_API)
+    file_object.seek(0)
+    uploaded_file = client.files.upload(
+        file={
+            "file_name": "upload.pdf",
+            "content": file_object.read(),
+        },
+        purpose="ocr",
+    )
 
+    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
+    pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url), model="mistral-ocr-latest", include_image_base64=True)
+    # response_dict = json.loads(pdf_response.json())
+    extracted_text = get_combined_markdown(pdf_response)
 
-    num_pages = len(pdf_reader.pages)
-    extraction_started = False
-    extracted_text = ""
-    for page_number in range(num_pages):
-        page = pdf_reader.pages[page_number]
-        page_text = page.extract_text()
-
-        extraction_started = True
-        page_number_start = page_number
-        if extraction_started:
-            extracted_text += page_text
-        if page_number_start + 1 < page_number:
-            break
     return extracted_text, title, abstract
 
 def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
@@ -511,9 +536,6 @@ def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
     return retrieved_content, comments, output2
 
 
-
-
-
 ########################################################################################################
 
 title = "LimitGen"