mlokendra committed on
Commit
cb5e453
·
verified ·
1 Parent(s): b7f263a
Files changed (1) hide show
  1. app.py +7 -72
app.py CHANGED
@@ -23,7 +23,9 @@ nest_asyncio.apply()
23
  generator = pipeline("text-generation",
24
  model="unsloth/gemma-3-1b-it",
25
  device_map='cpu',
26
- max_new_tokens=300)
 
 
27
  # Async function to get voices
28
  async def get_english_voices():
29
  voices = await VoicesManager.create()
@@ -56,7 +58,7 @@ KEY_TERMS = [
56
  def split_sentences(text):
57
  return re.split(r'(?<=[.!?])\s+', text.strip())
58
 
59
- def extract_sections_from_pdf_old(pdf_path):
60
  reader = PdfReader(pdf_path)
61
  full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
62
  full_text = re.sub(r'\n+', '\n', full_text)
@@ -69,12 +71,7 @@ def extract_sections_from_pdf_old(pdf_path):
69
  "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
70
  "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
71
  }
72
- section_patterns = {
73
- "Start of podcast with first section of paper as abstract": r"^abstract\b",
74
- "second section continuing from abstract to overview and no required to start introductuion between host & guest directly continue in discussion": r"^introduction\b|^overview\b",
75
- "third section continuing from overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"^method(?:ology)?\b|^proposed method\b|^approach\b|^model architecture\b|^experimental setup\b|^network design\b",
76
- "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"^conclusion(?:s)?\b|^summary\b|^final thought(?:s)\b|^result(?:s)\b",
77
- }
78
 
79
  sections = {}
80
  matches = []
@@ -96,66 +93,7 @@ def extract_sections_from_pdf_old(pdf_path):
96
 
97
  return sections,section_patterns
98
 
99
- # Define heading regex patterns
100
- SECTION_LABELS = {
101
- "abstract": r"\babstract\b",
102
- "introduction": r"\bintroduction\b",
103
- "methodology": r"\b(method(?:ology)?|approach|model architecture|implementation|framework|experimental setup)\b",
104
- "conclusion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
105
- }
106
 
107
- def is_heading(span):
108
- """Heuristic: if text is bold or font size is large, consider it heading"""
109
- return span['size'] > 11 and span['font'].lower().find("bold") != -1
110
-
111
- def clean_text(text):
112
- return re.sub(r'\s+', ' ', text.strip())
113
-
114
- def extract_sections_from_pdf(pdf_path):
115
- doc = fitz.open(pdf_path)
116
-
117
- headings = []
118
- paragraphs = []
119
- section_text_map = {}
120
-
121
- # Extract headings and text blocks
122
- for page in doc:
123
- blocks = page.get_text("dict")["blocks"]
124
- for block in blocks:
125
- for line in block.get("lines", []):
126
- for span in line["spans"]:
127
- txt = clean_text(span["text"])
128
- if len(txt) == 0:
129
- continue
130
- if is_heading(span):
131
- headings.append((txt, page.number))
132
- else:
133
- paragraphs.append((txt, page.number))
134
-
135
- # Identify section labels via regex
136
- labeled_headings = []
137
- for txt, page in headings:
138
- for label, pattern in SECTION_LABELS.items():
139
- if re.search(pattern, txt, re.IGNORECASE):
140
- labeled_headings.append((label, txt, page))
141
-
142
- # Sort labeled headings by page number
143
- labeled_headings.sort(key=lambda x: x[2])
144
-
145
- # Slice paragraphs by heading regions
146
- for i, (label, _, start_page) in enumerate(labeled_headings):
147
- end_page = labeled_headings[i + 1][2] if i + 1 < len(labeled_headings) else doc.page_count
148
-
149
- # Filter relevant paragraphs
150
- section_paras = [
151
- p[0] for p in paragraphs if start_page <= p[1] < end_page
152
- ]
153
-
154
- # Limit by 3–5 paragraphs for summarization efficiency
155
- limited_text = "\n".join(section_paras[:5])
156
- section_text_map[label] = limited_text
157
-
158
- return section_text_map,SECTION_LABELS
159
 
160
  def extract_paragraphs(text, max_paragraphs=4):
161
  # Use double newlines if present
@@ -264,11 +202,8 @@ async def tts_edge_line_by_line(script):
264
  print(f"⚠️ Skipping corrupt or empty file: {filename}")
265
  continue
266
 
267
- try:
268
- segment = AudioSegment.from_mp3(filename)
269
- segments.append(segment)
270
- except CouldntDecodeError as e:
271
- print(f"❌ Error decoding {filename}: {e}")
272
 
273
  return segments
274
 
 
23
  generator = pipeline("text-generation",
24
  model="unsloth/gemma-3-1b-it",
25
  device_map='cpu',
26
+ max_new_tokens=350,
27
+ do_sample=True,
28
+ temperature=0.7,)
29
  # Async function to get voices
30
  async def get_english_voices():
31
  voices = await VoicesManager.create()
 
58
  def split_sentences(text):
59
  return re.split(r'(?<=[.!?])\s+', text.strip())
60
 
61
+ def extract_sections_from_pdf(pdf_path):
62
  reader = PdfReader(pdf_path)
63
  full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
64
  full_text = re.sub(r'\n+', '\n', full_text)
 
71
  "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
72
  "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
73
  }
74
+
 
 
 
 
 
75
 
76
  sections = {}
77
  matches = []
 
93
 
94
  return sections,section_patterns
95
 
 
 
 
 
 
 
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  def extract_paragraphs(text, max_paragraphs=4):
99
  # Use double newlines if present
 
202
  print(f"⚠️ Skipping corrupt or empty file: {filename}")
203
  continue
204
 
205
+ segment = AudioSegment.from_mp3(filename)
206
+ segments.append(segment)
 
 
 
207
 
208
  return segments
209