wldmr committed on
Commit 25e3dec · 1 Parent(s): ce2a6bf

Files changed (2):
  1. app.py +28 -16
  2. metrics.py +60 -0
app.py CHANGED
@@ -5,43 +5,55 @@ import re
 # https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
 def cap(match):
     return(match.group().capitalize())
-
-
-def predict(brakes, transcript):
 
+def remove_filler_words(transcript):
+
+    # preserve line breaks
+    transcript_hash = " # ".join(transcript.strip().splitlines())
+    print('transcript_hash')
+    print(transcript_hash)
     # preprocess the text by removing filler words
     # Define a list of filler words to remove
     filler_words = ["um", "uh", "hmm", "ha", "er", "ah", "yeah"]
-    words = transcript.split()
+    words = transcript_hash.split()
     clean_words = [word for word in words if word.lower() not in filler_words]
-    input_text = " ".join(clean_words)
+    input_text_clean = ' '.join(clean_words)
+    # restore the line breaks
+    input_text = input_text_clean.replace(' # ','\n')
+    return input_text
     # Define a regular expression pattern that matches any filler word surrounded by whitespace or punctuation
     #pattern = r"(?<=\s|\b)(" + "|".join(fillers) + r")(?=\s|\b)"
     # Use re.sub to replace the filler words with empty strings
-    #clean_input_text = re.sub(pattern, "", input_text)
+    #clean_input_text = re.sub(pattern, "", input_text)
+
+def predict(brakes, transcript):
 
+    input_text = remove_filler_words(transcript)
     # Do the punctuation restoration
     model = PunctuationModel()
     output_text = model.restore_punctuation(input_text)
-
-    srt_file = input_text
-    punctuated = output_text
 
     # if any of the line break methods are implemented,
     # return the text as a single line
     pcnt_file_cr = output_text
 
     if 'textlines' in brakes:
-        # restore the carriage returns
-        srt_file_strip=srt_file.strip()
-        srt_file_sub=re.sub('\s*\n\s*','# ',srt_file_strip)
-        srt_file_array=srt_file_sub.split(' ')
-        pcnt_file_array=punctuated.split(' ')
-
+
+        # preserve line breaks
+        srt_file_hash = '# '.join(input_text.strip().splitlines())
+        #srt_file_sub=re.sub('\s*\n\s*','# ',srt_file_strip)
+        srt_file_array=srt_file_hash.split()
+        pcnt_file_array=output_text.split()
+
+        print('pcnt_file_array')
+        print(pcnt_file_array)
+        print('srt_file_array')
+        print(srt_file_array)
         # goal: restore the break points i.e. the same number of lines as the srt file
         # this is necessary, because each line in the srt file corresponds to a frame from the video
         if len(srt_file_array)!=len(pcnt_file_array):
             return "AssertError: The length of the transcript and the punctuated file should be the same: ",len(srt_file_array),len(pcnt_file_array)
+
         pcnt_file_array_hash = []
         for idx, item in enumerate(srt_file_array):
             if item.endswith('#'):
@@ -73,7 +85,7 @@ Model restores punctuation and case i.e. of the following punctuations -- [! ? .
 examples = [['sentences', "my name is clara i live in berkeley california"]]
 
 interface = gr.Interface(fn = predict,
-    inputs = [gr.Radio(["no breaks","sentences", "textlines"], default="no breaks", label="line brakes"),
+    inputs = [gr.Radio(["no brakes","sentences", "textlines"], value="no brakes", label="line brakes"),
         "text"],
     outputs = ["text"],
     title = title,
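
The refactor above replaces the old regex-based carriage-return handling with a " # " sentinel trick: each newline is swapped for a marker token so filler words can be filtered word-by-word without losing the original line structure. Below is a minimal sketch of that round-trip, separate from the commit; the sample transcript is invented for illustration.

# Sketch (not part of the commit) of the " # " marker round-trip
# used by remove_filler_words; the sample transcript is invented.
filler_words = ["um", "uh", "hmm", "ha", "er", "ah", "yeah"]

transcript = "um hello everyone\nuh today we talk about ah punctuation"

# preserve line breaks: mark each newline with a " # " sentinel
transcript_hash = " # ".join(transcript.strip().splitlines())

# drop filler words; the "#" markers survive the word-level filter
clean_words = [w for w in transcript_hash.split() if w.lower() not in filler_words]

# restore the line breaks from the markers
print(" ".join(clean_words).replace(" # ", "\n"))
# hello everyone
# today we talk about punctuation

Note the round-trip assumes no transcript line consists entirely of filler words; adjacent markers would then fail to match " # " and a break would be lost, a limitation the committed code shares.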
metrics.py ADDED
@@ -0,0 +1,60 @@
+# Import nltk library for natural language processing
+import nltk
+
+from transformers import AutoTokenizer
+
+# Define a function that takes some text as input and returns the number of tokens
+def token_count(text):
+    # Import the Encoder class from bpe
+    from bpe import Encoder
+    # Create an encoder object with a large vocabulary size
+    encoder = Encoder(vocab_size=14735746)
+
+    # Train the encoder on the text
+    encoder.fit(text.split())
+
+    # Encode the text into tokens
+    tokens = encoder.tokenize(text)
+
+    # Return the number of tokens
+    return len(tokens)
+
+def num_tokens(text):
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+    token_ids = tokenizer.encode(text)
+
+    token_size = len(token_ids)
+
+    return token_size
+
+def num_words(text):
+    sentences = nltk.sent_tokenize(text)
+    # Tokenize each sentence into words using nltk.word_tokenize()
+    words = []
+    for sentence in sentences:
+        words.extend(nltk.word_tokenize(sentence))
+
+    num_words = len(words)
+
+    return num_words
+
+def num_sentences(text):
+    # Tokenize the text into sentences using nltk.sent_tokenize()
+    sentences = nltk.sent_tokenize(text)
+    num_sentences = len(sentences)
+    return num_sentences
+
+
+def num_chars(text):
+    num_characters = len(text)
+    return num_characters
+
+
+# Print out the results
+# print(f"Number of sentences: {num_sentences}")
+# print(f"Number of words: {num_words}")
+# print(f"Number of tokens: {num_tokens}")
+# print(f"Number of trans_tokens: {trans_tokens}")
+# print(f"Number of characters: {num_characters}")