Spaces:
Running
Running
app.py
CHANGED
@@ -23,7 +23,9 @@ nest_asyncio.apply()
|
|
23 |
generator = pipeline("text-generation",
|
24 |
model="unsloth/gemma-3-1b-it",
|
25 |
device_map='cpu',
|
26 |
-
max_new_tokens=
|
|
|
|
|
27 |
# Async function to get voices
|
28 |
async def get_english_voices():
|
29 |
voices = await VoicesManager.create()
|
@@ -56,7 +58,7 @@ KEY_TERMS = [
|
|
56 |
def split_sentences(text):
|
57 |
return re.split(r'(?<=[.!?])\s+', text.strip())
|
58 |
|
59 |
-
def
|
60 |
reader = PdfReader(pdf_path)
|
61 |
full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
|
62 |
full_text = re.sub(r'\n+', '\n', full_text)
|
@@ -69,12 +71,7 @@ def extract_sections_from_pdf_old(pdf_path):
|
|
69 |
"third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
|
70 |
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
71 |
}
|
72 |
-
|
73 |
-
"Start of podcast with first section of paper as abstract": r"^abstract\b",
|
74 |
-
"second section continuing from abstract to overview and no required to start introductuion between host & guest directly continue in discussion": r"^introduction\b|^overview\b",
|
75 |
-
"third section continuing from overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"^method(?:ology)?\b|^proposed method\b|^approach\b|^model architecture\b|^experimental setup\b|^network design\b",
|
76 |
-
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"^conclusion(?:s)?\b|^summary\b|^final thought(?:s)\b|^result(?:s)\b",
|
77 |
-
}
|
78 |
|
79 |
sections = {}
|
80 |
matches = []
|
@@ -96,66 +93,7 @@ def extract_sections_from_pdf_old(pdf_path):
|
|
96 |
|
97 |
return sections,section_patterns
|
98 |
|
99 |
-
# Define heading regex patterns
|
100 |
-
SECTION_LABELS = {
|
101 |
-
"abstract": r"\babstract\b",
|
102 |
-
"introduction": r"\bintroduction\b",
|
103 |
-
"methodology": r"\b(method(?:ology)?|approach|model architecture|implementation|framework|experimental setup)\b",
|
104 |
-
"conclusion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
105 |
-
}
|
106 |
|
107 |
-
def is_heading(span):
|
108 |
-
"""Heuristic: if text is bold or font size is large, consider it heading"""
|
109 |
-
return span['size'] > 11 and span['font'].lower().find("bold") != -1
|
110 |
-
|
111 |
-
def clean_text(text):
|
112 |
-
return re.sub(r'\s+', ' ', text.strip())
|
113 |
-
|
114 |
-
def extract_sections_from_pdf(pdf_path):
|
115 |
-
doc = fitz.open(pdf_path)
|
116 |
-
|
117 |
-
headings = []
|
118 |
-
paragraphs = []
|
119 |
-
section_text_map = {}
|
120 |
-
|
121 |
-
# Extract headings and text blocks
|
122 |
-
for page in doc:
|
123 |
-
blocks = page.get_text("dict")["blocks"]
|
124 |
-
for block in blocks:
|
125 |
-
for line in block.get("lines", []):
|
126 |
-
for span in line["spans"]:
|
127 |
-
txt = clean_text(span["text"])
|
128 |
-
if len(txt) == 0:
|
129 |
-
continue
|
130 |
-
if is_heading(span):
|
131 |
-
headings.append((txt, page.number))
|
132 |
-
else:
|
133 |
-
paragraphs.append((txt, page.number))
|
134 |
-
|
135 |
-
# Identify section labels via regex
|
136 |
-
labeled_headings = []
|
137 |
-
for txt, page in headings:
|
138 |
-
for label, pattern in SECTION_LABELS.items():
|
139 |
-
if re.search(pattern, txt, re.IGNORECASE):
|
140 |
-
labeled_headings.append((label, txt, page))
|
141 |
-
|
142 |
-
# Sort labeled headings by page number
|
143 |
-
labeled_headings.sort(key=lambda x: x[2])
|
144 |
-
|
145 |
-
# Slice paragraphs by heading regions
|
146 |
-
for i, (label, _, start_page) in enumerate(labeled_headings):
|
147 |
-
end_page = labeled_headings[i + 1][2] if i + 1 < len(labeled_headings) else doc.page_count
|
148 |
-
|
149 |
-
# Filter relevant paragraphs
|
150 |
-
section_paras = [
|
151 |
-
p[0] for p in paragraphs if start_page <= p[1] < end_page
|
152 |
-
]
|
153 |
-
|
154 |
-
# Limit by 3–5 paragraphs for summarization efficiency
|
155 |
-
limited_text = "\n".join(section_paras[:5])
|
156 |
-
section_text_map[label] = limited_text
|
157 |
-
|
158 |
-
return section_text_map,SECTION_LABELS
|
159 |
|
160 |
def extract_paragraphs(text, max_paragraphs=4):
|
161 |
# Use double newlines if present
|
@@ -264,11 +202,8 @@ async def tts_edge_line_by_line(script):
|
|
264 |
print(f"⚠️ Skipping corrupt or empty file: {filename}")
|
265 |
continue
|
266 |
|
267 |
-
|
268 |
-
|
269 |
-
segments.append(segment)
|
270 |
-
except CouldntDecodeError as e:
|
271 |
-
print(f"❌ Error decoding {filename}: {e}")
|
272 |
|
273 |
return segments
|
274 |
|
|
|
23 |
generator = pipeline("text-generation",
|
24 |
model="unsloth/gemma-3-1b-it",
|
25 |
device_map='cpu',
|
26 |
+
max_new_tokens=350,
|
27 |
+
do_sample=True,
|
28 |
+
temperature=0.7,)
|
29 |
# Async function to get voices
|
30 |
async def get_english_voices():
|
31 |
voices = await VoicesManager.create()
|
|
|
58 |
def split_sentences(text):
|
59 |
return re.split(r'(?<=[.!?])\s+', text.strip())
|
60 |
|
61 |
+
def extract_sections_from_pdf(pdf_path):
|
62 |
reader = PdfReader(pdf_path)
|
63 |
full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
|
64 |
full_text = re.sub(r'\n+', '\n', full_text)
|
|
|
71 |
"third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
|
72 |
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
73 |
}
|
74 |
+
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
sections = {}
|
77 |
matches = []
|
|
|
93 |
|
94 |
return sections,section_patterns
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
def extract_paragraphs(text, max_paragraphs=4):
|
99 |
# Use double newlines if present
|
|
|
202 |
print(f"⚠️ Skipping corrupt or empty file: {filename}")
|
203 |
continue
|
204 |
|
205 |
+
segment = AudioSegment.from_mp3(filename)
|
206 |
+
segments.append(segment)
|
|
|
|
|
|
|
207 |
|
208 |
return segments
|
209 |
|