Spaces:

camparchimedes
/

nb

Build error

App Files

camparchimedes committed on Aug 28, 2024

Commit

04f2c63

verified ·

1 Parent(s): 3a0e2ab

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -188

app.py CHANGED Viewed

@@ -37,17 +37,18 @@ from fpdf import FPDF
 import psutil
 from gpuinfo import GPUInfo
-import numpy as np
 import torch
-import torchaudio
-import torchaudio.transforms as transforms
-from transformers import pipeline, AutoModel
-import spacy
-import networkx as nx
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
 warnings.filterwarnings("ignore")
 # ------------header section------------
@@ -107,124 +108,6 @@ def transcribe(microphone, file_upload, batch_size=15):
     return warn_output + text.strip(), system_info
-# ------------summary section------------
-# ------------for app integration later------------
-nlp = spacy.blank("nb")  # codename 'nb' = Norwegian Bokmål
-nlp.add_pipe('sentencizer')
-spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
-summarization_model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
-# pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large")
-@spaces.GPU()
-def clean_text(text):
-    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
-    text = re.sub(r'[^\w\s]', '', text)
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
-@spaces.GPU()
-def preprocess_text(text, file_upload):
-    if (text is not None) and (file_upload is None):
-        doc = nlp(text)
-    elif (text is None) and (file_upload is not None):
-        doc = nlp(file_upload)
-    stop_words = spacy_stop_words
-    words = [token.text for token in doc if token.text.lower() not in stop_words]
-    return ' '.join(words)
-@spaces.GPU()
-def summarize_text(text, file_upload):
-    #
-    # ----add same if/elif logic as above here----
-    #
-    preprocessed_text = preprocess_text(text, file_upload)
-    inputs = summarization_model(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
-    inputs = inputs.to(device)
-    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
-    return summarization_model.decode(summary_ids[0], skip_special_tokens=True)
-@spaces.GPU()
-def build_similarity_matrix(sentences):
-    similarity_matrix = nx.Graph()
-    for i, tokens_a in enumerate(sentences):
-        for j, tokens_b in enumerate(sentences):
-            if i != j:
-                common_words = set(tokens_a) & set(tokens_b)
-                similarity_matrix.add_edge(i, j, weight=len(common_words))
-    return similarity_matrix
-# PageRank
-@spaces.GPU()
-def graph_based_summary(text, file_upload, num_paragraphs=3):
-    #
-    # ----add same if/elif logic as above here----
-    #
-    sentences = [sent.text for sent in doc.sents]
-    if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)
-    sentence_tokens = [nlp(sent) for sent in sentences]
-    stop_words = spacy_stop_words
-    filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens]
-    similarity_matrix = build_similarity_matrix(filtered_tokens)
-    scores = nx.pagerank(similarity_matrix)
-    ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
-    return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])
-@spaces.GPU()
-def lex_rank_summary(text, file_upload, num_paragraphs=3, threshold=0.1):
-    if (text is not None) and (file_upload is None):
-        doc = nlp(text)
-    elif (text is None) and (file_upload is not None):
-        doc = nlp(file_upload)
-    sentences = [sent.text for sent in doc.sents]
-    if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)
-    stop_words = spacy_stop_words
-    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
-    X = vectorizer.fit_transform(sentences)
-    similarity_matrix = cosine_similarity(X, X)
-    # Apply threshold@similarity matrix
-    similarity_matrix[similarity_matrix < threshold] = 0
-    nx_graph = nx.from_numpy_array(similarity_matrix)
-    scores = nx.pagerank(nx_graph)
-    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
-    return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
-@spaces.GPU()
-def text_rank_summary(text, file_upload, num_paragraphs=3):
-    if (text is not None) and (file_upload is not None):
-        doc = nlp(text)
-    elif (text is None) and (file_upload is not None):
-        doc = nlp(file_upload)
-    sentences = [sent.text for sent in doc.sents]
-    if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)
-    stop_words = spacy_stop_words
-    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
-    X = vectorizer.fit_transform(sentences)
-    similarity_matrix = cosine_similarity(X, X)
-    nx_graph = nx.from_numpy_array(similarity_matrix)
-    scores = nx.pagerank(nx_graph)
-    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
-    return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
 def save_to_pdf(text, summary):
     pdf = FPDF()
@@ -245,6 +128,7 @@ def save_to_pdf(text, summary):
     pdf.output(pdf_output_path)
     return pdf_output_path
 iface = gr.Blocks()
 with iface:
@@ -262,71 +146,12 @@ with iface:
         transcribe_btn = gr.Button("Transcribe Interview")
         text_output = gr.Textbox()
         system_info = gr.Textbox(label="System Info")
-        # Corrected the order of arguments here to prevent the SyntaxError
         transcribe_btn.click(fn=transcribe, inputs=[microphone, upload], outputs=[text_output, system_info])
     with gr.Tabs():
-        with gr.TabItem("Summary | PageRank"):
-            text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
-            summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")
-            gr.Markdown("""
-            **token-based**: similarity matrix edge weights representing token overlap/
-            ranked by their centrality in the graph (good with dense inter-sentence relationships)
-            """)
-            gr.Markdown("""
-            *Bjørn*: **gir sammendrag som fanger opp de mest relevante setninger i teksten**
-            """)
-            summarize_transcribed_button_graph = gr.Button("Summary of Transcribed Text, Click Here")
-            summarize_transcribed_button_graph.click(fn=lambda text: graph_based_summary(text, None), inputs=[text_input_graph], outputs=[summary_output_graph])
-            summarize_uploaded_button_graph = gr.Button("Upload Text to Summarize, Click Here")
-            summarize_uploaded_button_graph.click(fn=graph_based_summary, inputs=[None, upload], outputs=[summary_output_graph])
-        with gr.TabItem("Summary | LexRank"):
-            text_output = gr.Textbox(label="Transcription Output")
-            text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
-            summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")
-            gr.Markdown("""
-            **semantic**: TF-IDF vectorization@cosine similarity matrix, ranked by eigenvector centrality.
-            (good for sparse graph structures with thresholding)
-            """)
-            gr.Markdown("""
-            *Bjørn*: **gir sammendrag som best fanger opp betydningen av hele teksten**
-            """)
-            summarize_transcribed_button_lex = gr.Button("Summary of Transcribed Text, Click Here")
-            summarize_transcribed_button_lex.click(fn=lambda text: lex_rank_summary(text, None), inputs=[text_input_lex], outputs=[summary_output_lex])
-            summarize_uploaded_button_lex = gr.Button("Upload Text to Summarize, Click Here")
-            summarize_uploaded_button_lex.click(fn=lex_rank_summary, inputs=[None, upload], outputs=[summary_output_lex])
-        with gr.TabItem("Summary | TextRank"):
-            text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
-            summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")
-            gr.Markdown("""
-            **sentence**: graph with weighted edges based on lexical similarity. (i.e" "sentence similarity"word overlap)/sentence similarity
-            """)
-            gr.Markdown("""
-            *Bjørn*: **sammendrag basert på i de setningene som ligner mest på hverandre fra teksten**
-            """)
-            summarize_transcribed_button_text_rank = gr.Button("Summary of Transcribed Text, Click Here")
-            summarize_transcribed_button_text_rank.click(fn=lambda text: text_rank_summary(text, None), inputs=[text_input_text_rank], outputs=[summary_output_text_rank])
-            summarize_uploaded_button_text_rank = gr.Button("Upload Text to Summarize, Click Here")
-            summarize_uploaded_button_text_rank.click(fn=text_rank_summary, inputs=[None, upload], outputs=[summary_output_text_rank])
         with gr.TabItem("Download PDF"):
-            pdf_text_only = gr.Button("Download PDF with Transcribed Text Only")
-            pdf_summary_only = gr.Button("Download PDF with Summary-of-Transcribed-Text Only")
-            pdf_both = gr.Button("Download PDF with Both")
             pdf_output = gr.File(label="Download PDF")
-            pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
-            pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank], outputs=[pdf_output])  # Includes all summary outputs
-            pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output_graph], outputs=[pdf_output])

 import psutil
 from gpuinfo import GPUInfo
+#import numpy as np
 import torch
+#import torchaudio
+#import torchaudio.transforms as transforms
+from transformers import pipeline #AutoModel
+#import spacy
+#import networkx as nx
+#from sklearn.feature_extraction.text import TfidfVectorizer
+#from sklearn.metrics.pairwise import cosine_similarity
 warnings.filterwarnings("ignore")
 # ------------header section------------
     return warn_output + text.strip(), system_info
 def save_to_pdf(text, summary):
     pdf = FPDF()
     pdf.output(pdf_output_path)
     return pdf_output_path
 iface = gr.Blocks()
 with iface:
         transcribe_btn = gr.Button("Transcribe Interview")
         text_output = gr.Textbox()
         system_info = gr.Textbox(label="System Info")
         transcribe_btn.click(fn=transcribe, inputs=[microphone, upload], outputs=[text_output, system_info])
     with gr.Tabs():
         with gr.TabItem("Download PDF"):
+            pdf_text_only = gr.Button("Download PDF with Transcribed Text")
             pdf_output = gr.File(label="Download PDF")
+            pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])