Spaces:

wing-nus
/

SciAssist

Running

App Files Community

pr2

#13

by dyxohjl666 - opened Oct 23, 2023

base: refs/heads/main

←

from: refs/pr/13

Discussion Files changed

+60

-185

Files changed (5) hide show

README.md +1 -1
app.py +37 -46
controlled_summarization.py +20 -126
description.py +1 -9
requirements.txt +1 -3

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🚀
 colorFrom: red
 colorTo: red
 sdk: gradio
-sdk_version: 3.50.2
 app_file: app.py
 pinned: false
 license: afl-3.0

 colorFrom: red
 colorTo: red
 sdk: gradio
+sdk_version: 3.21.0
 app_file: app.py
 pinned: false
 license: afl-3.0

app.py CHANGED Viewed

@@ -5,13 +5,33 @@ from reference_string_parsing import *
 from controlled_summarization import *
 from dataset_extraction import *
-from controlled_summarization import recommended_kw
 import requests
 # Example Usage
-# url = "https://arxiv.org/pdf/2305.14996.pdf"
-# dest_folder = "./examples/"
-# download_pdf(url, dest_folder)
 with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
@@ -25,67 +45,39 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
                 gr.Markdown(ctrlsum_file_md)
                 with gr.Row():
                     with gr.Column():
-                        ctrlsum_url = gr.Textbox(label="PDF URL", max_lines=1)
-                        ctrlsum_file = gr.File(label="Input File")
                         ctrlsum_str = gr.TextArea(label="Input String", max_lines=5)
                         with gr.Column():
-                            gr.Markdown("* Set the length of text used for summarization. Length 0 will exert no control over length.")
                             # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
                             # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-                            ctrlsum_file_length = gr.Radio(label="Length", value=0, choices=[0, 50, 100, 200])
-                            kw = gr.Radio(visible=False)
-                            ctrlsum_file_keywords = gr.Textbox(label="Keywords", max_lines=1)
                         with gr.Row():
                             ctrlsum_file_btn = gr.Button("Generate")
                     ctrlsum_file_output = gr.Textbox(
                         elem_id="htext",
                         label="Summary",
                     )
-                ctrlsum_file_examples = gr.Examples(
-                    examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique", "", ""],
-                              ["examples/H01-1042.pdf", 0, "automatic evaluation technique", "", ""]],
-                    inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str, ctrlsum_url
-                            ])
         ctrlsum_file_btn.click(
             fn=ctrlsum_for_file,
-            inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str, ctrlsum_url],
-            outputs=[ctrlsum_file_output, ctrlsum_str, ctrlsum_file]
         )
         def clear():
-            return None, 0, None, None, gr.Radio(visible=False)
-        def update_url(url):
-            if url in recommended_kw.keys():
-                keywords = recommended_kw[url]
-                if keywords != None:
-                    return None, None, gr.Radio(choices=keywords[:3], label="Recommended Keywords", visible=True,
-                                                interactive=True)
-            return None, None, gr.Radio(visible=False)
-        ctrlsum_file.upload(clear, inputs=None,
-                            outputs=[ctrlsum_str, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_url, kw])
-        ctrlsum_url.input(update_url, inputs=ctrlsum_url, outputs=[ctrlsum_str, ctrlsum_file, kw])
-        ctrlsum_str.input(clear, inputs=None,
-                          outputs=[ctrlsum_url, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file, kw])
-        def select_kw(env: gr.SelectData):
-            return env.value
-        kw.select(select_kw, None, ctrlsum_file_keywords)
         # Reference String Parsing
         with gr.TabItem("Reference String Parsing"):
-            gr.Markdown(rsp_title_md)
             with gr.Box():
                 gr.Markdown(rsp_str_md)
                 with gr.Row():
@@ -139,7 +131,6 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
         # Dataset Extraction
         with gr.TabItem("Dataset Mentions Extraction"):
-            gr.Markdown(de_title_md)
             with gr.Box():
                 gr.Markdown(de_str_md)
                 with gr.Row():

 from controlled_summarization import *
 from dataset_extraction import *
 import requests
+def download_pdf(url, dest_folder):
+    """
+    Download a PDF from a given URL and save it to a specified destination folder.
+    Parameters:
+        url (str): URL of the PDF
+        dest_folder (str): Destination folder to save the downloaded PDF
+    """
+    if not os.path.exists(dest_folder):
+        os.makedirs(dest_folder)
+    response = requests.get(url, stream=True)
+    filename = os.path.join(dest_folder, url.split("/")[-1])
+    with open(filename, 'wb') as file:
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                file.write(chunk)
+    print(f"Downloaded {url} to {filename}")
+    return filename
 # Example Usage
+#url = "https://arxiv.org/pdf/2305.14996.pdf"
+#dest_folder = "./examples/"
+#download_pdf(url, dest_folder)
 with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
                 gr.Markdown(ctrlsum_file_md)
                 with gr.Row():
                     with gr.Column():
+                        ctrlsum_url = gr.TextArea(label="PDF URL", max_lines=1)
+                        ctrlsum_file = gr.File(label="Input File", max_lines=2)
                         ctrlsum_str = gr.TextArea(label="Input String", max_lines=5)
                         with gr.Column():
+                            gr.Markdown("* Length 0 will exert no control over length.")
                             # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
                             # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
+                            ctrlsum_file_length = gr.Slider(0,300,step=50, label="Length")
+                            ctrlsum_file_keywords = gr.Textbox(label="Keywords",max_lines=1)
                         with gr.Row():
                             ctrlsum_file_btn = gr.Button("Generate")
                     ctrlsum_file_output = gr.Textbox(
                         elem_id="htext",
                         label="Summary",
                     )
+                ctrlsum_file_examples = gr.Examples(examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique"],["examples/H01-1042.pdf", 0, "automatic evaluation technique"]],
+                                                inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])
+        if len(ctrlsum_url.value) > 4:
+            ctrlsum_file = download_pdf(ctrlsum_url.value, './cache/')
         ctrlsum_file_btn.click(
             fn=ctrlsum_for_file,
+            inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str],
+            outputs=[ctrlsum_file_output, ctrlsum_str]
         )
         def clear():
+            return None,0,None
+        ctrlsum_file.change(clear, inputs=None,outputs=[ctrlsum_str,ctrlsum_file_length,ctrlsum_file_keywords])
         # Reference String Parsing
         with gr.TabItem("Reference String Parsing"):
             with gr.Box():
                 gr.Markdown(rsp_str_md)
                 with gr.Row():
         # Dataset Extraction
         with gr.TabItem("Dataset Mentions Extraction"):
             with gr.Box():
                 gr.Markdown(de_str_md)
                 with gr.Row():

controlled_summarization.py CHANGED Viewed

@@ -1,106 +1,22 @@
 from typing import List, Tuple
 import torch
 from SciAssist import Summarization
-import os
-import requests
-from datasets import load_dataset
-print(f"Is CUDA available: {torch.cuda.is_available()}")
-# True
-if torch.cuda.is_available():
-    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
-    device = 'gpu'
-    ctrlsum_pipeline = Summarization(os_name="nt",model_name="flan-t5-xl",checkpoint="dyxohjl666/flant5-xl-cocoscisum",device=device)
-else:
-    device = 'cpu'
-    ctrlsum_pipeline = Summarization(os_name="nt",device=device)
-acl_dict = {}
-recommended_kw = {}
-acl_data = load_dataset("dyxohjl666/CocoScisum_ACL", revision="refs/convert/parquet")
-def convert_to_dict(data):
-    """ Dict:
-        { url:
-            {length:
-                {keywords: summary};
-             raw_text:
-                 str;
-            }
-        }
-    """
-    url = data["url"]
-    text = data["text"]
-    keywords = data["keywords"]
-    length = data["length"]
-    summary = data["summary"]
-    for u, t, k, l, s in zip(url, text, keywords, length, summary):
-        if len(u) < 5:
-            continue
-        u = u + ".pdf"
-        if k == None:
-            k = ""
-        if l == None:
-            l = ""
-        k = str(k).strip()
-        l = str(l).strip()
-        if u in acl_dict.keys():
-            if k in acl_dict[u][l].keys():
-                continue
-            else:
-                acl_dict[u][l][k] = s
-        else:
-            acl_dict[u] = {"": {}, "50": {}, "100": {}, "200": {}, "raw_text": t}
-        # kws
-        if u in recommended_kw.keys():
-            if k == "" or k in recommended_kw[u]:
-                continue
-            else:
-                recommended_kw[u].append(k)
-        else:
-            recommended_kw[u] = []
-    return 1
-for i in acl_data.keys():
-    signal = convert_to_dict(acl_data[i])
-def download_pdf(url, dest_folder):
-    """
-    Download a PDF from a given URL and save it to a specified destination folder.
-    Parameters:
-        url (str): URL of the PDF
-        dest_folder (str): Destination folder to save the downloaded PDF
-    """
-    if not os.path.exists(dest_folder):
-        os.makedirs(dest_folder)
-    response = requests.get(url, stream=True)
-    filename = os.path.join(dest_folder, url.split("/")[-1])
-    with open(filename, 'wb') as file:
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                file.write(chunk)
-    print(f"Downloaded {url} to {filename}")
-    return filename
-def ctrlsum_for_str(input, length=None, keywords=None) -> List[Tuple[str, str]]:
     if keywords is not None:
         keywords = keywords.strip().split(",")
         if keywords[0] == "":
             keywords = None
-    if length == 0 or length is None:
         length = None
     results = ctrlsum_pipeline.predict(input, type="str",
-                                       length=length, keywords=keywords, num_beams=1)
     output = []
     for res in results["summary"]:
@@ -108,58 +24,36 @@ def ctrlsum_for_str(input, length=None, keywords=None) -> List[Tuple[str, str]]:
     return "".join(output)
-def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> List[Tuple[str, str, str]]:
-    if input == None and url == "":
-        if text == "":
-            return None, "Input cannot be left blank.", None
         else:
-            return ctrlsum_for_str(text, length, keywords), text, None
     else:
-        filename = ""
-        url = url.strip()
-        if url != "":
-            if len(url) > 4 and url[-3:] == "pdf":
-                if url.strip() in acl_dict.keys():
-                    raw_text = acl_dict[url]["raw_text"]
-                    l = str(length)
-                    if length == 0:
-                        l = ""
-                    if l in acl_dict[url].keys():
-                        if keywords.strip() in acl_dict[url][l].keys():
-                            summary = acl_dict[url][l][keywords]
-                            return summary, raw_text, None
-                    if keywords.strip() == "":
-                        keywords = None
-                    if l == "":
-                        l = None
-                    return ctrlsum_for_str(raw_text, int(l), keywords), raw_text, None
-                filename = download_pdf(url, './cache/')
-            else:
-                "Invalid url(Not PDF)!", None, None
-        else:
-            filename = input.name
-        if keywords != "":
             keywords = keywords.strip().split(",")
             if keywords[0] == "":
                 keywords = None
-        if length == 0:
             length = None
         # Identify the format of input and parse reference strings
         if filename[-4:] == ".txt":
             results = ctrlsum_pipeline.predict(filename, type="txt",
-                                               save_results=False,
-                                               length=length, keywords=keywords, num_beams=1)
         elif filename[-4:] == ".pdf":
             results = ctrlsum_pipeline.predict(filename,
-                                               save_results=False, length=length, keywords=keywords, num_beams=1)
         else:
-            return "File Format Error !", None, filename
         output = []
         for res in results["summary"]:
             output.append(f"{res}\n\n")
-        return "".join(output), results["raw_text"], filename
-ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . 
Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . 
• BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "

 from typing import List, Tuple
 import torch
 from SciAssist import Summarization
+device = "gpu" if torch.cuda.is_available() else "cpu"
+ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base",device=device)
+def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
     if keywords is not None:
         keywords = keywords.strip().split(",")
         if keywords[0] == "":
             keywords = None
+    if length==0 or length is None:
         length = None
     results = ctrlsum_pipeline.predict(input, type="str",
+                                    length=length, keywords=keywords)
     output = []
     for res in results["summary"]:
     return "".join(output)
+def ctrlsum_for_file(input, length=None, keywords=None, text="") -> List[Tuple[str, str]]:
+    if input == None:
+        if text=="":
+            return None
         else:
+            return ctrlsum_for_str(text,length,keywords),text
     else:
+        filename = input.name
+        if keywords is not None:
             keywords = keywords.strip().split(",")
             if keywords[0] == "":
                 keywords = None
+        if length==0:
             length = None
         # Identify the format of input and parse reference strings
         if filename[-4:] == ".txt":
             results = ctrlsum_pipeline.predict(filename, type="txt",
+                                            save_results=False,
+                                            length=length, keywords=keywords)
         elif filename[-4:] == ".pdf":
             results = ctrlsum_pipeline.predict(filename,
+                                            save_results=False, length=length, keywords=keywords)
         else:
+            return [("File Format Error !", None)]
         output = []
         for res in results["summary"]:
             output.append(f"{res}\n\n")
+        return "".join(output), results["raw_text"]
+ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . 
Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . 
• BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "

description.py CHANGED Viewed

@@ -1,8 +1,4 @@
 # Reference string parsing Markdown
-rsp_title_md = '''
-## Reference String Parsing parses a citation string, extracting information such as the title, authors, and publication date.
-'''
 rsp_str_md = '''
 To **test on strings**, simply input one or more strings.
 '''
@@ -46,8 +42,6 @@ To **test on strings**, simply input a string.
 ctrlsum_file_md = '''
 This is the demo for **CocoSciSum**.
-## Controlled Summarization uses FLAN-T5 to generate user-customised summaries from your input file or URL link.
 To **test on a file**, the input can be:
 - A txt file which contains the content to be summarized.
@@ -58,9 +52,7 @@ To **test on a file**, the input can be:
 '''
-de_title_md = '''
-## Dataset Extraction detects dataset mentions from the input text.
-'''
 de_str_md = '''
 To **test on strings**, please input your sentences or paragraphs.

 # Reference string parsing Markdown
 rsp_str_md = '''
 To **test on strings**, simply input one or more strings.
 '''
 ctrlsum_file_md = '''
 This is the demo for **CocoSciSum**.
 To **test on a file**, the input can be:
 - A txt file which contains the content to be summarized.
 '''
 de_str_md = '''
 To **test on strings**, please input your sentences or paragraphs.

requirements.txt CHANGED Viewed

@@ -1,6 +1,4 @@
 pip==23.2.1
 torch==1.12.0
-SciAssist==0.1.4
 nltk~=3.7
-pytest
-huggingface-hub==0.27.1

 pip==23.2.1
 torch==1.12.0
+SciAssist==0.0.41
 nltk~=3.7