remove content_string (not used) + clean unicode non-printable chars + add pymupdf reading for pdf urls
- app.py +0 -13
- google_search.py +24 -3
app.py
CHANGED
@@ -330,7 +330,6 @@ def generate_article(
     num_examples: str,
     conclusion_type: str,
     ai_model: str,
-    content_string: str,
     url_content: str = None,
     api_key: str = None,
     pdf_file_input: list[str] = None,
@@ -352,7 +351,6 @@ def generate_article(
         "references": [r.strip() for r in references.split(",")],
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
-        "sources": content_string,
         "generated_article": generated_article,
         "user_comments": user_comments,
     }
@@ -475,7 +473,6 @@ def save_to_cloud_storage(
     num_examples,
     conclusion_type,
     ai_model,
-    content_string,
     url_content,
     generated_article,
     user_comments,
@@ -508,7 +505,6 @@ def save_to_cloud_storage(
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
         "ai_model": ai_model,
-        "content_string": content_string,
         "url_content": url_content,
         "generated_article": generated_article,
         "user_comments": user_comments,
@@ -558,7 +554,6 @@ def generate_and_format(
     generated_article: str = None,
     user_comments: str = None,
 ):
-    content_string = ""
     url_content = None
     if google_search_check:
         date_from = build_date(year_from, month_from, day_from)
@@ -573,12 +568,6 @@ def generate_and_format(
             final_query += " " + " ".join(exclude_queries)
         print(f"Google Search Query: {final_query}")
         url_content = google_search(final_query, sorted_date, domains_to_include)
-        content_string = "\n".join(
-            f"{url.strip()}: \n{content.strip()[:2500]}" for url, content in url_content.items()
-        )
-        content_string = (
-            "Use the trusted information here from the URLs and add them as References:\n" + content_string
-        )
     topic_context = topic + ", " + context
     article = generate_article(
         input_role,
@@ -596,7 +585,6 @@ def generate_and_format(
         num_examples,
         conclusion_type,
         ai_model,
-        content_string,
         url_content,
         api_key,
         pdf_file_input,
@@ -631,7 +619,6 @@ def generate_and_format(
         num_examples,
         conclusion_type,
         ai_model,
-        content_string,
         url_content,
         generated_article,
         user_comments,
google_search.py
CHANGED
@@ -7,6 +7,8 @@ from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 import html2text
 import requests
+import unicodedata
+import fitz
 
 load_dotenv()
 
@@ -31,7 +33,9 @@ h2t.default_image_alt = "[image]" # Default alt text for images
 
 
 def clean_html(text):
-
+    text = h2t.handle(text)
+    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII") # Remove non-ASCII characters
+    return text
 
 
 def build_results_beautifulsoup(url_list):
@@ -119,12 +123,29 @@ async def get_url_data(url, client):
     try:
         r = await client.get(url)
         if r.status_code == 200:
-
-
+            content_type = r.headers.get("Content-Type", "").lower()
+            # detect if pdf
+            if "application/pdf" in content_type or url.lower().endswith(".pdf"):
+                pdf_content = await extract_pdf_text(r.content)
+                return BeautifulSoup(pdf_content, "html.parser")
+            else:
+                return BeautifulSoup(r.content, "html.parser")
     except Exception:
         return None
 
 
+async def extract_pdf_text(content):
+    try:
+        with fitz.open(stream=content, filetype="pdf") as doc:
+            text = ""
+            for page in doc:
+                text += page.get_text()
+            return f"<div>{text}</div>" # Wrap in a div to make it valid HTML
+    except Exception as e:
+        print(f"Error extracting PDF text: {str(e)}")
+        return "<div>Error extracting PDF text</div>"
+
+
 async def parallel_scrap(urls):
     async with httpx.AsyncClient(timeout=30) as client:
         tasks = []
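
A quick local sketch (not part of this commit) of how the two new helpers behave. The sample HTML string and the PDF URL are placeholder assumptions, and it presumes google_search.py imports cleanly with its .env in place:

# try_helpers.py - hypothetical throwaway script, not included in the commit
import asyncio

import httpx

from google_search import clean_html, extract_pdf_text

# clean_html now runs html2text and then an NFKD normalization that folds
# accented letters to plain ASCII and drops characters with no ASCII form
# (curly quotes, dashes, other non-printable leftovers).
sample_html = "<p>R\u00e9sum\u00e9 \u2013 \u201ccurly quotes\u201d and\u00a0a non-breaking space</p>"
print(clean_html(sample_html))  # roughly: "Resume  curly quotes and a non-breaking space"


# extract_pdf_text opens the raw response bytes with PyMuPDF (fitz),
# concatenates the text of every page, and wraps it in a <div> so
# get_url_data can hand it to BeautifulSoup like any HTML page.
async def main():
    pdf_url = "https://example.com/sample.pdf"  # placeholder URL
    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.get(pdf_url)
        print(await extract_pdf_text(r.content))


asyncio.run(main())

Note that the ASCII-only fold is deliberately lossy: characters with no ASCII equivalent are dropped rather than transliterated.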