Spaces:

mlokendra
/

pdf_to_poadcast

Running

App Files Files Community

mlokendra committed on Jul 1

Commit

cb5e453

verified ·

1 Parent(s): b7f263a

u

Browse files

Files changed (1) hide show

app.py +7 -72

app.py CHANGED Viewed

@@ -23,7 +23,9 @@ nest_asyncio.apply()
 generator = pipeline("text-generation",
                      model="unsloth/gemma-3-1b-it",
                      device_map='cpu',
-                     max_new_tokens=300)
 # Async function to get voices
 async def get_english_voices():
     voices = await VoicesManager.create()
@@ -56,7 +58,7 @@ KEY_TERMS = [
 def split_sentences(text):
     return re.split(r'(?<=[.!?])\s+', text.strip())
-def extract_sections_from_pdf_old(pdf_path):
     reader = PdfReader(pdf_path)
     full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
     full_text = re.sub(r'\n+', '\n', full_text)
@@ -69,12 +71,7 @@ def extract_sections_from_pdf_old(pdf_path):
         "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
         "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
     }
-    section_patterns = {
-        "Start of podcast with first section of paper as abstract": r"^abstract\b",
-        "second section continuing from abstract to overview and no required to start introductuion between host & guest directly continue in discussion": r"^introduction\b|^overview\b",
-        "third section continuing from overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"^method(?:ology)?\b|^proposed method\b|^approach\b|^model architecture\b|^experimental setup\b|^network design\b",
-        "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"^conclusion(?:s)?\b|^summary\b|^final thought(?:s)\b|^result(?:s)\b",
-    }
     sections = {}
     matches = []
@@ -96,66 +93,7 @@ def extract_sections_from_pdf_old(pdf_path):
     return sections,section_patterns
-# Define heading regex patterns
-SECTION_LABELS = {
-    "abstract": r"\babstract\b",
-    "introduction": r"\bintroduction\b",
-    "methodology": r"\b(method(?:ology)?|approach|model architecture|implementation|framework|experimental setup)\b",
-    "conclusion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
-}
-def is_heading(span):
-    """Heuristic: if text is bold or font size is large, consider it heading"""
-    return span['size'] > 11 and span['font'].lower().find("bold") != -1
-def clean_text(text):
-    return re.sub(r'\s+', ' ', text.strip())
-def extract_sections_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    headings = []
-    paragraphs = []
-    section_text_map = {}
-    # Extract headings and text blocks
-    for page in doc:
-        blocks = page.get_text("dict")["blocks"]
-        for block in blocks:
-            for line in block.get("lines", []):
-                for span in line["spans"]:
-                    txt = clean_text(span["text"])
-                    if len(txt) == 0:
-                        continue
-                    if is_heading(span):
-                        headings.append((txt, page.number))
-                    else:
-                        paragraphs.append((txt, page.number))
-    # Identify section labels via regex
-    labeled_headings = []
-    for txt, page in headings:
-        for label, pattern in SECTION_LABELS.items():
-            if re.search(pattern, txt, re.IGNORECASE):
-                labeled_headings.append((label, txt, page))
-    # Sort labeled headings by page number
-    labeled_headings.sort(key=lambda x: x[2])
-    # Slice paragraphs by heading regions
-    for i, (label, _, start_page) in enumerate(labeled_headings):
-        end_page = labeled_headings[i + 1][2] if i + 1 < len(labeled_headings) else doc.page_count
-        # Filter relevant paragraphs
-        section_paras = [
-            p[0] for p in paragraphs if start_page <= p[1] < end_page
-        ]
-        # Limit by 3–5 paragraphs for summarization efficiency
-        limited_text = "\n".join(section_paras[:5])
-        section_text_map[label] = limited_text
-    return section_text_map,SECTION_LABELS
 def extract_paragraphs(text, max_paragraphs=4):
     # Use double newlines if present
@@ -264,11 +202,8 @@ async def tts_edge_line_by_line(script):
             print(f"⚠️ Skipping corrupt or empty file: {filename}")
             continue
-        try:
-            segment = AudioSegment.from_mp3(filename)
-            segments.append(segment)
-        except CouldntDecodeError as e:
-            print(f"❌ Error decoding {filename}: {e}")
     return segments

 generator = pipeline("text-generation",
                      model="unsloth/gemma-3-1b-it",
                      device_map='cpu',
+                     max_new_tokens=350,
+                     do_sample=True,
+                     temperature=0.7,)
 # Async function to get voices
 async def get_english_voices():
     voices = await VoicesManager.create()
 def split_sentences(text):
     return re.split(r'(?<=[.!?])\s+', text.strip())
+def extract_sections_from_pdf(pdf_path):
     reader = PdfReader(pdf_path)
     full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
     full_text = re.sub(r'\n+', '\n', full_text)
         "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
         "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
     }
     sections = {}
     matches = []
     return sections,section_patterns
 def extract_paragraphs(text, max_paragraphs=4):
     # Use double newlines if present
             print(f"⚠️ Skipping corrupt or empty file: {filename}")
             continue
+        segment = AudioSegment.from_mp3(filename)
+        segments.append(segment)
     return segments