minko186 committed
Commit a62cc34 · 1 Parent(s): e2a79fa

remove content_string (not used) + clean unicode non-printable chars + add pymupdf reading for pdf urls

Files changed (2):
  1. app.py +0 -13
  2. google_search.py +24 -3
app.py CHANGED
@@ -330,7 +330,6 @@ def generate_article(
     num_examples: str,
     conclusion_type: str,
     ai_model: str,
-    content_string: str,
     url_content: str = None,
     api_key: str = None,
     pdf_file_input: list[str] = None,
@@ -352,7 +351,6 @@ def generate_article(
         "references": [r.strip() for r in references.split(",")],
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
-        "sources": content_string,
         "generated_article": generated_article,
         "user_comments": user_comments,
     }
@@ -475,7 +473,6 @@ def save_to_cloud_storage(
     num_examples,
     conclusion_type,
     ai_model,
-    content_string,
     url_content,
     generated_article,
     user_comments,
@@ -508,7 +505,6 @@ def save_to_cloud_storage(
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
         "ai_model": ai_model,
-        "content_string": content_string,
         "url_content": url_content,
         "generated_article": generated_article,
         "user_comments": user_comments,
@@ -558,7 +554,6 @@ def generate_and_format(
     generated_article: str = None,
     user_comments: str = None,
 ):
-    content_string = ""
     url_content = None
     if google_search_check:
         date_from = build_date(year_from, month_from, day_from)
@@ -573,12 +568,6 @@ def generate_and_format(
             final_query += " " + " ".join(exclude_queries)
         print(f"Google Search Query: {final_query}")
         url_content = google_search(final_query, sorted_date, domains_to_include)
-        content_string = "\n".join(
-            f"{url.strip()}: \n{content.strip()[:2500]}" for url, content in url_content.items()
-        )
-        content_string = (
-            "Use the trusted information here from the URLs and add them as References:\n" + content_string
-        )
     topic_context = topic + ", " + context
     article = generate_article(
         input_role,
@@ -596,7 +585,6 @@ def generate_and_format(
         num_examples,
         conclusion_type,
         ai_model,
-        content_string,
         url_content,
         api_key,
         pdf_file_input,
@@ -631,7 +619,6 @@ def generate_and_format(
         num_examples,
         conclusion_type,
         ai_model,
-        content_string,
         url_content,
         generated_article,
         user_comments,

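
For reference, the dropped content_string block in generate_and_format only built a prompt preamble from the scraped pages and was then passed along unused. A minimal standalone sketch of what it produced, using a dummy url_content dict (hypothetical values, not from the repo):

# Sketch of the removed preamble builder, run against dummy scrape results.
url_content = {
    "https://example.com/a ": "  First page text ...  ",
    "https://example.com/b ": "  Second page text ...  ",
}

content_string = "\n".join(
    f"{url.strip()}: \n{content.strip()[:2500]}" for url, content in url_content.items()
)
content_string = (
    "Use the trusted information here from the URLs and add them as References:\n" + content_string
)
print(content_string)  # the string that used to be passed (unused) into generate_article
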
google_search.py CHANGED
@@ -7,6 +7,8 @@ from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 import html2text
 import requests
+import unicodedata
+import fitz
 
 load_dotenv()
 
@@ -31,7 +33,9 @@ h2t.default_image_alt = "[image]" # Default alt text for images
 
 
 def clean_html(text):
-    return h2t.handle(text)
+    text = h2t.handle(text)
+    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")  # Remove non-ASCII characters
+    return text
 
 
 def build_results_beautifulsoup(url_list):
@@ -119,12 +123,29 @@ async def get_url_data(url, client):
     try:
         r = await client.get(url)
         if r.status_code == 200:
-            soup = BeautifulSoup(r.content, "html.parser")
-            return soup
+            content_type = r.headers.get("Content-Type", "").lower()
+            # detect if pdf
+            if "application/pdf" in content_type or url.lower().endswith(".pdf"):
+                pdf_content = await extract_pdf_text(r.content)
+                return BeautifulSoup(pdf_content, "html.parser")
+            else:
+                return BeautifulSoup(r.content, "html.parser")
     except Exception:
         return None
 
 
+async def extract_pdf_text(content):
+    try:
+        with fitz.open(stream=content, filetype="pdf") as doc:
+            text = ""
+            for page in doc:
+                text += page.get_text()
+            return f"<div>{text}</div>"  # Wrap in a div to make it valid HTML
+    except Exception as e:
+        print(f"Error extracting PDF text: {str(e)}")
+        return "<div>Error extracting PDF text</div>"
+
+
 async def parallel_scrap(urls):
     async with httpx.AsyncClient(timeout=30) as client:
         tasks = []
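
The new clean_html behavior can be sanity-checked in isolation. A minimal sketch with a fresh html2text instance and a sample string of my own (not the module's configuration or test code):

import unicodedata

import html2text

h2t = html2text.HTML2Text()

def clean_html(text):
    # Same idea as the updated function: markdown-ify, then NFKD-normalize and
    # drop anything without an ASCII equivalent (accents, smart quotes, NBSP, ...).
    text = h2t.handle(text)
    return unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")

print(clean_html("<p>Café\u00a0“quotes” and\u200bzero-width junk</p>"))
# Accented characters are reduced to their ASCII base ("Cafe"); characters with
# no ASCII form (curly quotes, zero-width space) are simply dropped.

One side effect of the ASCII-ignore step worth noting: any non-Latin text (e.g. CJK) in a scraped page is stripped entirely, since it has no ASCII decomposition.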
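
The PDF path can likewise be exercised on its own. A minimal sketch assuming PyMuPDF is installed (pip install pymupdf) and that a local sample.pdf stands in for the downloaded bytes:

import fitz  # PyMuPDF

def extract_pdf_text_sync(content: bytes) -> str:
    # Same logic as the new helper, minus the async wrapper: open the raw
    # response bytes in memory and concatenate the text of every page.
    with fitz.open(stream=content, filetype="pdf") as doc:
        text = "".join(page.get_text() for page in doc)
    return f"<div>{text}</div>"  # wrapped so BeautifulSoup can parse it like HTML

with open("sample.pdf", "rb") as fh:  # stand-in for r.content from httpx
    print(extract_pdf_text_sync(fh.read())[:300])

Note that extract_pdf_text in the diff is declared async but contains no await, so the parsing itself runs synchronously inside the event loop.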