Spaces:
Runtime error

Commit: bugs

Files changed:
- app.py +28 -16
- metrics.py +60 -0
app.py
CHANGED
@@ -5,43 +5,55 @@ import re
 # https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
 def cap(match):
     return(match.group().capitalize())
-
-
-def predict(brakes, transcript):
 
+def remove_filler_words(transcript):
+
+    # preserve line brakes
+    transcript_hash = " # ".join(transcript.strip().splitlines())
+    print('transcript_hash')
+    print(transcript_hash)
     # preprocess the text by removing filler words
     # Define a list of filler words to remove
     filler_words = ["um", "uh", "hmm", "ha", "er", "ah", "yeah"]
-    words =
+    words = transcript_hash.split()
     clean_words = [word for word in words if word.lower() not in filler_words]
-
+    input_text_clean = ' '.join(clean_words)
+    # restore the line brakes
+    input_text = input_text_clean.replace(' # ', '\n')
+    return input_text
     # Define a regular expression pattern that matches any filler word surrounded by whitespace or punctuation
     #pattern = r"(?<=\s|\b)(" + "|".join(fillers) + r")(?=\s|\b)"
     # Use re.sub to replace the filler words with empty strings
-    #clean_input_text = re.sub(pattern, "", input_text)
+    #clean_input_text = re.sub(pattern, "", input_text)
+
+def predict(brakes, transcript):
 
+    input_text = remove_filler_words(transcript)
     # Do the punctuation restauration
     model = PunctuationModel()
     output_text = model.restore_punctuation(input_text)
-
-    srt_file = input_text
-    punctuated = output_text
 
     # if any of the line brake methods are implemented,
     # return the text as a single line
     pcnt_file_cr = output_text
 
     if 'textlines' in brakes:
-
-
-
-
-
-
+
+        # preserve line brakes
+        srt_file_hash = '# '.join(input_text.strip().splitlines())
+        #srt_file_sub = re.sub('\s*\n\s*', '# ', srt_file_strip)
+        srt_file_array = srt_file_hash.split()
+        pcnt_file_array = output_text.split()
+
+        print('pcnt_file_array')
+        print(pcnt_file_array)
+        print('srt_file_array')
+        print(srt_file_array)
         # goal: restore the break points i.e. the same number of lines as the srt file
         # this is necessary, because each line in the srt file corresponds to a frame from the video
         if len(srt_file_array) != len(pcnt_file_array):
             return "AssertError: The length of the transcript and the punctuated file should be the same: ", len(srt_file_array), len(pcnt_file_array)
+
         pcnt_file_array_hash = []
         for idx, item in enumerate(srt_file_array):
             if item.endswith('#'):
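The new `remove_filler_words` helper in this hunk protects line breaks by joining the lines with a " # " sentinel before word-level filtering, then mapping the sentinel back to "\n". A minimal, self-contained sketch of that round trip (the filler list is copied from the diff; `strip_fillers` is a hypothetical name, not the Space's function):

FILLER_WORDS = {"um", "uh", "hmm", "ha", "er", "ah", "yeah"}

def strip_fillers(transcript):
    # Join lines with a " # " sentinel so the breaks survive word-level filtering.
    flat = " # ".join(transcript.strip().splitlines())
    # Drop filler words case-insensitively; the "#" sentinel is never a filler.
    kept = [w for w in flat.split() if w.lower() not in FILLER_WORDS]
    # Turn the sentinel back into real line breaks.
    return " ".join(kept).replace(" # ", "\n")

print(strip_fillers("um hello there\nyeah this is uh a test"))
# hello there
# this is a test

Like the code in the diff, this sketch assumes no line consists entirely of filler words; if one does, two sentinels end up adjacent and the final replace no longer maps them back cleanly.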
@@ -73,7 +85,7 @@ Model restores punctuation and case i.e. of the following punctuations -- [! ? .
 examples = [['sentences', "my name is clara i live in berkeley california"]]
 
 interface = gr.Interface(fn = predict,
-                         inputs = [gr.Radio(["no
+                         inputs = [gr.Radio(["no brakes", "sentences", "textlines"], value="no brakes", label="line brakes"),
                                    "text"],
                          outputs = ["text"],
                          title = title,
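The first hunk's `'textlines'` branch is cut off in this view after `if item.endswith('#'):`, but its comments state the goal: since `'# '.join(...)` glues a trailing "#" onto the last word of every line, a word-count check followed by a word-by-word walk can put a break back after each marked word. A hedged sketch of that idea (`restore_line_breaks` is a hypothetical name, not code from the Space):

def restore_line_breaks(original, punctuated):
    # "# ".join(["a b", "c d"]) -> "a b# c d": the last word of every
    # line except the final one now ends with "#".
    marked = "# ".join(original.strip().splitlines()).split()
    punct_words = punctuated.split()
    # The punctuation model only attaches punctuation to existing words,
    # so the two word counts should line up one-to-one.
    if len(marked) != len(punct_words):
        raise ValueError(f"word counts differ: {len(marked)} vs {len(punct_words)}")
    out = []
    for src, word in zip(marked, punct_words):
        out.append(word + "\n" if src.endswith("#") else word + " ")
    return "".join(out).strip()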
metrics.py
ADDED
@@ -0,0 +1,60 @@
+# Import nltk library for natural language processing
+import nltk
+
+from transformers import AutoTokenizer
+
+# Define a function that takes some text as input and returns the number of tokens
+def token_count(text):
+    # Import the Encoder class from bpe
+    from bpe import Encoder
+    # Create an encoder object with a vocabulary size of 10
+    encoder = Encoder(vocab_size=14735746)
+
+    # Train the encoder on the text
+    encoder.fit(text.split())
+
+    # Encode the text into tokens
+    tokens = encoder.tokenize(text)
+
+    # Return the number of tokens
+    return tokens
+
+def num_tokens(text):
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+    token_ids = tokenizer.encode(text)
+
+    token_size = len(token_ids)
+
+    return token_size
+
+def num_words(text):
+    sentences = nltk.sent_tokenize(text)
+    # Tokenize each sentence into words using nltk.word_tokenize()
+    words = []
+    for sentence in sentences:
+        words.extend(nltk.word_tokenize(sentence))
+
+    num_words = len(words)
+
+    return num_words
+
+def num_sentences(text):
+    # Tokenize the text into sentences using nltk.sent_tokenize()
+    sentences = nltk.sent_tokenize(text)
+    num_sentences = len(sentences)
+    return num_sentences
+
+
+def num_chars(text):
+    num_characters = len(text)
+    return num_characters
+
+
+# Print out the results
+# print(f"Number of sentences: {num_sentences}")
+# print(f"Number of words: {num_words}")
+# print(f"Number of tokens: {num_tokens}")
+# print(f"Number of trans_tokens: {trans_tokens}")
+# print(f"Number of characters: {num_characters}")
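metrics.py leans on NLTK's Punkt sentence models and, for `num_tokens`, the GPT-2 tokenizer; both are fetched on first use. A minimal usage sketch (assuming metrics.py is on the import path and nltk plus transformers are installed):

import nltk
nltk.download("punkt")  # one-time download; sent_tokenize/word_tokenize need it

from metrics import num_tokens, num_words, num_sentences, num_chars

text = "my name is clara. i live in berkeley, california."
print("sentences:", num_sentences(text))
print("words:", num_words(text))
print("gpt2 tokens:", num_tokens(text))  # downloads the gpt2 tokenizer on first call
print("characters:", num_chars(text))

`token_count` is left out here on purpose: as committed it fits a fresh BPE encoder on every call and returns the token list rather than a count, so `num_tokens` is the cheaper and deterministic choice.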