remove content_string (not used) + clean unicode non-printable chars + add pymupdf reading for pdf urls
- app.py +0 -13
- google_search.py +24 -3
app.py
CHANGED
@@ -330,7 +330,6 @@ def generate_article(
     num_examples: str,
     conclusion_type: str,
     ai_model: str,
-    content_string: str,
     url_content: str = None,
     api_key: str = None,
     pdf_file_input: list[str] = None,
@@ -352,7 +351,6 @@ def generate_article(
         "references": [r.strip() for r in references.split(",")],
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
-        "sources": content_string,
         "generated_article": generated_article,
         "user_comments": user_comments,
     }
@@ -475,7 +473,6 @@ def save_to_cloud_storage(
     num_examples,
     conclusion_type,
     ai_model,
-    content_string,
     url_content,
     generated_article,
     user_comments,
@@ -508,7 +505,6 @@ def save_to_cloud_storage(
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
         "ai_model": ai_model,
-        "content_string": content_string,
         "url_content": url_content,
         "generated_article": generated_article,
         "user_comments": user_comments,
@@ -558,7 +554,6 @@ def generate_and_format(
     generated_article: str = None,
     user_comments: str = None,
 ):
-    content_string = ""
     url_content = None
     if google_search_check:
         date_from = build_date(year_from, month_from, day_from)
@@ -573,12 +568,6 @@ def generate_and_format(
             final_query += " " + " ".join(exclude_queries)
         print(f"Google Search Query: {final_query}")
         url_content = google_search(final_query, sorted_date, domains_to_include)
-        content_string = "\n".join(
-            f"{url.strip()}: \n{content.strip()[:2500]}" for url, content in url_content.items()
-        )
-        content_string = (
-            "Use the trusted information here from the URLs and add them as References:\n" + content_string
-        )
     topic_context = topic + ", " + context
     article = generate_article(
         input_role,
@@ -596,7 +585,6 @@ def generate_and_format(
         num_examples,
         conclusion_type,
         ai_model,
-        content_string,
         url_content,
         api_key,
         pdf_file_input,
@@ -631,7 +619,6 @@ def generate_and_format(
         num_examples,
         conclusion_type,
         ai_model,
-        content_string,
         url_content,
         generated_article,
         user_comments,
google_search.py
CHANGED
@@ -7,6 +7,8 @@ from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 import html2text
 import requests
+import unicodedata
+import fitz
 
 load_dotenv()
 
@@ -31,7 +33,9 @@ h2t.default_image_alt = "[image]" # Default alt text for images
 
 
 def clean_html(text):
-
+    text = h2t.handle(text)
+    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII") # Remove non-ASCII characters
+    return text
 
 
 def build_results_beautifulsoup(url_list):
@@ -119,12 +123,29 @@ async def get_url_data(url, client):
     try:
         r = await client.get(url)
         if r.status_code == 200:
-
-
+            content_type = r.headers.get("Content-Type", "").lower()
+            # detect if pdf
+            if "application/pdf" in content_type or url.lower().endswith(".pdf"):
+                pdf_content = await extract_pdf_text(r.content)
+                return BeautifulSoup(pdf_content, "html.parser")
+            else:
+                return BeautifulSoup(r.content, "html.parser")
     except Exception:
         return None
 
 
+async def extract_pdf_text(content):
+    try:
+        with fitz.open(stream=content, filetype="pdf") as doc:
+            text = ""
+            for page in doc:
+                text += page.get_text()
+            return f"<div>{text}</div>" # Wrap in a div to make it valid HTML
+    except Exception as e:
+        print(f"Error extracting PDF text: {str(e)}")
+        return "<div>Error extracting PDF text</div>"
+
+
 async def parallel_scrap(urls):
     async with httpx.AsyncClient(timeout=30) as client:
         tasks = []
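
A quick local sketch (not part of this commit) of how the two new helpers behave. The sample HTML string and the PDF URL are placeholder assumptions, and it presumes google_search.py imports cleanly with its .env in place:

# try_helpers.py - hypothetical throwaway script, not included in the commit
import asyncio

import httpx

from google_search import clean_html, extract_pdf_text

# clean_html now runs html2text and then an NFKD normalization that folds
# accented letters to plain ASCII and drops characters with no ASCII form
# (curly quotes, dashes, other non-printable leftovers).
sample_html = "<p>R\u00e9sum\u00e9 \u2013 \u201ccurly quotes\u201d and\u00a0a non-breaking space</p>"
print(clean_html(sample_html))  # roughly: "Resume  curly quotes and a non-breaking space"


# extract_pdf_text opens the raw response bytes with PyMuPDF (fitz),
# concatenates the text of every page, and wraps it in a <div> so
# get_url_data can hand it to BeautifulSoup like any HTML page.
async def main():
    pdf_url = "https://example.com/sample.pdf"  # placeholder URL
    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.get(pdf_url)
        print(await extract_pdf_text(r.content))


asyncio.run(main())

Note that the ASCII-only fold is deliberately lossy: characters with no ASCII equivalent are dropped rather than transliterated.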