import re

import nltk
import requests
import spacy
import streamlit as st
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from youtube_transcript_api import YouTubeTranscriptApi
def fetch_article_text(url: str):
    """Scrape the headline and paragraph text from an article URL."""
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.find_all(["h1", "p"])
    article = " ".join(result.text for result in results)
    # Strip citation markers such as "[1]" left over from footnotes.
    return re.sub(r"\[\d+\]", "", article)
def count_tokens(text: str):
    """Approximate the token count as the number of whitespace-separated words."""
    return len(text.split())
def get_text_from_youtube_url(url: str):
    """Download a video's transcript and reconstruct it as punctuated text."""
    # Take the "v" query parameter, ignoring any parameters that follow it.
    video_id = url.split("=")[1].split("&")[0]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception:
        # Fall back to any available English transcript (e.g. auto-generated).
        transcript = (
            YouTubeTranscriptApi.list_transcripts(video_id)
            .find_transcript(["en"])
            .fetch()
        )
    script = ""
    for line in transcript:
        text = line["text"]
        if text != "[Music]":
            script += text.lower() + " "
    return add_punctuation(script)
def add_punctuation(text: str):
    """Re-punctuate a raw transcript using spaCy's sentence segmentation."""
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # Download the model on first run if it is not installed yet.
        import spacy.cli
        spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    punctuation = [".", ",", ";", ":", "?", "!"]
    sentences = []
    for sent in doc.sents:
        # Drop any punctuation token spaCy already found at the end.
        if sent[-1].text in punctuation:
            sent = sent[:-1]
        if len(sent) == 0:
            continue
        # Heuristic: a sentence ending in a verb becomes a question,
        # everything else gets a period.
        mark = "?" if sent[-1].pos_ == "VERB" else "."
        sentence = sent.text + mark
        sentence = sentence[0].upper() + sentence[1:]
        sentences.append(sentence)
    return " ".join(sentences)
def get_input_chunks(text: str, max_length: int = 500):
    """Split text into chunks of at most max_length words, on sentence boundaries."""
    text = re.sub(r"\[\d+\]", "", text)
    try:
        sentences = sent_tokenize(text)
    except LookupError:
        # Fetch the NLTK sentence tokenizer data on first run.
        nltk.download("punkt")
        sentences = sent_tokenize(text)
    # Keep only non-empty sentences longer than four words.
    sentences = [s for s in sentences if len(s.strip()) > 0 and count_tokens(s) > 4]
    input_chunks = []
    current_chunk = ""
    tokens = 0
    for sentence in sentences:
        if tokens + count_tokens(sentence) < max_length:
            current_chunk += " " + sentence if current_chunk else sentence
            tokens += count_tokens(sentence)
        else:
            input_chunks.append(current_chunk)
            tokens = count_tokens(sentence)
            current_chunk = sentence
    if current_chunk:
        input_chunks.append(current_chunk)
    return input_chunks
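

# Minimal usage sketch, assuming the file is run directly rather than through
# the Space's Streamlit UI; the Wikipedia URL below is a placeholder for
# illustration, not taken from the original app.
if __name__ == "__main__":
    article = fetch_article_text("https://en.wikipedia.org/wiki/Natural_language_processing")
    for i, chunk in enumerate(get_input_chunks(article)):
        print(f"chunk {i}: {count_tokens(chunk)} words")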