Spaces:

siddhartharya
/

My_NotebookLM_Podcast_Generator

Running

siddhartharya committed on Sep 29, 2024

Commit

b5389b5

verified ·

1 Parent(s): 2a109d4

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -7,6 +7,8 @@ import json
 import re
 from gtts import gTTS
 import tempfile
 groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
 tokenizer = tiktoken.get_encoding("cl100k_base")
@@ -24,9 +26,27 @@ def truncate_text(text, max_tokens=2048):
         return tokenizer.decode(tokens[:max_tokens])
     return text
 def generate_script(system_prompt: str, input_text: str, tone: str, target_length: str):
     input_text = truncate_text(input_text)
-    word_limit = 300 if target_length == "Short (1-2 min)" else 750  # Assuming 150 words per minute
     prompt = f"""
     {system_prompt}

 import re
 from gtts import gTTS
 import tempfile
+import requests
+from bs4 import BeautifulSoup
 groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
 tokenizer = tiktoken.get_encoding("cl100k_base")
         return tokenizer.decode(tokens[:max_tokens])
     return text
+def extract_text_from_url(url):
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        for script in soup(["script", "style"]):
+            script.decompose()
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = '\n'.join(chunk for chunk in chunks if chunk)
+        return text
+    except Exception as e:
+        raise ValueError(f"Error extracting text from URL: {str(e)}")
 def generate_script(system_prompt: str, input_text: str, tone: str, target_length: str):
     input_text = truncate_text(input_text)
+    word_limit = 300 if target_length == "Short (1-2 min)" else 750
     prompt = f"""
     {system_prompt}