Spaces:
Running
Running
Commit
·
2427a02
1
Parent(s):
35c2c5b
Add initial project structure with PDF summarization features and Gradio interface
Browse files- .gitignore +1 -0
- README.md +1 -1
- app.py +54 -0
- main.py +35 -0
- math_summarizer.py +91 -0
- nlp_summarizer.py +56 -0
- requirements.txt +1 -0
- tools.py +19 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__
|
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
title: Rexplore Api
|
3 |
-
emoji:
|
4 |
colorFrom: red
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
|
|
1 |
---
|
2 |
title: Rexplore Api
|
3 |
+
emoji: 📑
|
4 |
colorFrom: red
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
app.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from main import main
|
3 |
+
|
4 |
+
def rexplore_summarizer(corpus):
    """Run the summarization pipeline and fan the result out to the UI.

    Args:
        corpus: Text pasted by the user.

    Returns:
        A 3-tuple of the full response mapping returned by ``main``,
        followed by its ``'summary'`` and ``'mindmap'`` entries.
    """
    result = main(corpus)
    summary_text = result['summary']
    mindmap_text = result['mindmap']
    return result, summary_text, mindmap_text
|
7 |
+
|
8 |
+
def clear_everything(text_corpus, raw_data, summary, mindmap):
    """Reset callback: ignore the current values and blank all four fields.

    The parameters only mirror the components wired as inputs; none of them
    is read. One ``None`` is returned per output component.
    """
    return (None,) * 4
|
10 |
+
|
11 |
+
# Font families handed to the theme, in the same order (and with the same
# repetition) as the original literal list.
_font_families = ('Syne', 'Poppins', 'Poppins', 'Poppins')

# Visual theme shared by the whole Blocks app.
theme = gr.themes.Soft(
    primary_hue="purple",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont(family) for family in _font_families],
)
|
22 |
+
|
23 |
+
# Static header markup rendered at the top of the page.
_HEADER_HTML = '''
<h1 style="text-align: center;">ReXplore Summarizer <p style="text-align: center;">Designed and Developed by <a href='https://raannakasturi.eu.org' target="_blank" rel="nofollow noreferrer external">Nayan Kasturi</a></p> </h1>
<p style="text-align: center;">This app uses a hybrid approach to summarize PDF documents based on CPU as well as GPU.</p>
<p style="text-align: center;">The app uses traditional methodologies such as TextRank, LSA, Luhn algorithms as well as large language model (LLM) to generate summaries as well as mindmaps.</p>
<p style="text-align: center;">The summarization process can take some time depending on the size of the text corpus and the complexity of the content.</p>
'''

# Keyword arguments common to the three read-only output boxes.
_READONLY_BOX = {"lines": 7, "interactive": False, "show_copy_button": True}

# NOTE(review): the nesting of the Row/Column containers below is
# reconstructed from statement order; confirm it against the intended layout.
with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as app:
    gr.HTML(value=_HEADER_HTML)
    with gr.Row():
        with gr.Column():
            text_corpus = gr.TextArea(
                label="Text Corpus",
                placeholder="Paste the text corpus here",
                lines=5,
            )
            with gr.Row():
                clear_btn = gr.Button(value="Clear", variant='stop')
                summarize_btn = gr.Button(value="Summarize", variant='primary')
        raw_data = gr.TextArea(
            label="Raw Data",
            placeholder="The generated raw data will be displayed here",
            **_READONLY_BOX,
        )
    with gr.Row():
        summary = gr.TextArea(
            label="Summary",
            placeholder="The generated summary will be displayed here",
            **_READONLY_BOX,
        )
        mindmap = gr.TextArea(
            label="Mindmap",
            placeholder="The generated mindmap will be displayed here",
            **_READONLY_BOX,
        )

    # Summarize: one request at a time, also exposed as a named API endpoint.
    summarize_btn.click(
        rexplore_summarizer,
        inputs=[text_corpus],
        outputs=[raw_data, summary, mindmap],
        concurrency_limit=1,
        scroll_to_output=True,
        show_api=True,
        api_name="rexplore_summarizer",
        show_progress="full",
    )

    # Clear: every field is both an input and an output of the reset callback.
    _all_fields = (text_corpus, raw_data, summary, mindmap)
    clear_btn.click(
        clear_everything,
        inputs=list(_all_fields),
        outputs=list(_all_fields),
        show_api=False,
    )

app.queue(default_concurrency_limit=1).launch(show_api=True)
|
main.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from math_summarizer import generate_math_summary
|
2 |
+
from nlp_summarizer import generate_nlp_summary_and_mindmap
|
3 |
+
from tools import parse_pdf
|
4 |
+
import openai
|
5 |
+
import time
|
6 |
+
|
7 |
+
def create_client(api_key):
    """Build an OpenAI SDK client that talks to the glhf.chat endpoint.

    Args:
        api_key: Credential passed straight through to ``openai.OpenAI``.

    Returns:
        The configured ``openai.OpenAI`` client instance.
    """
    return openai.OpenAI(
        base_url="https://glhf.chat/api/openai/v1",
        api_key=api_key,
    )
|
13 |
+
|
14 |
+
def generate_summary(client, corpus):
    """Produce the summary/mindmap response for a text corpus.

    The extractive ("math") summary is computed first and acts as a gate:
    when it cannot be produced the LLM stage is skipped entirely.

    Args:
        client: Chat-completions client forwarded to the NLP stage.
        corpus: Text to summarize.

    Returns:
        A dict with the keys ``summary_status``, ``summary``,
        ``mindmap_status`` and ``mindmap``. On a math-summary failure both
        statuses are ``"error"`` and both payloads are ``None``.
    """
    print("Generating Math Summary")
    math_summary = generate_math_summary(corpus)
    if not math_summary:
        print("Error generating Math Summary")
        # Nothing was generated, so the mindmap must not be reported as a
        # success (the original returned "success" alongside a None mindmap).
        return {
            'summary_status': "error",
            'summary': None,
            'mindmap_status': "error",
            'mindmap': None,
        }

    print("Math Summary Generated Successfully")
    # NOTE(review): math_summary is only used as a gate here; the LLM stage
    # receives the raw corpus. Confirm whether the extractive summary was
    # meant to be forwarded instead.
    print("Generating NLP Summary and Mindmap")
    response = generate_nlp_summary_and_mindmap(client, corpus)
    print("NLP Summary and Mindmap Generated Successfully")
    return response
|
31 |
+
|
32 |
+
def main(corpus):
    """Entry point: build the API client and summarize ``corpus``.

    The API key is read from the ``GLHF_API_KEY`` environment variable when
    it is set, so deployments can supply their own credential without
    editing source.

    Args:
        corpus: Text to summarize.

    Returns:
        The response dict produced by ``generate_summary``.
    """
    import os

    # SECURITY: the literal below is a credential committed to the repository.
    # It is kept only as a backward-compatible fallback; rotate it and provide
    # GLHF_API_KEY through the environment / Space secrets instead.
    api_key = os.environ.get("GLHF_API_KEY") or "glhf_0d3c695626fec941eeb7914dd0e36da5"
    client = create_client(api_key)
    return generate_summary(client, corpus)
|
math_summarizer.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sumy.parsers.plaintext import PlaintextParser
|
2 |
+
from sumy.nlp.tokenizers import Tokenizer
|
3 |
+
from sumy.summarizers.text_rank import TextRankSummarizer
|
4 |
+
from sumy.summarizers.luhn import LuhnSummarizer
|
5 |
+
from sumy.summarizers.lex_rank import LexRankSummarizer
|
6 |
+
from sumy.summarizers.lsa import LsaSummarizer
|
7 |
+
from sumy.nlp.stemmers import Stemmer
|
8 |
+
from sumy.utils import get_stop_words
|
9 |
+
import nltk
|
10 |
+
import threading
|
11 |
+
|
12 |
+
# Language name handed to the sumy tokenizer, stemmer and stop-word lookup.
LANGUAGE = "english"
# Sentence count requested from each extractive summarizer.
SENTENCES_COUNT = 15
|
14 |
+
|
15 |
+
def _summarize_with(summarizer_cls, research_paper_text):
    """Run one sumy summarizer class over the text and return its sentences.

    Shared implementation behind the TextRank, Luhn and LSA entry points,
    which previously repeated this body verbatim.
    """
    # Make sure the NLTK tokenizer data is present before building a Tokenizer.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
    summarizer = summarizer_cls(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = summarizer(parser.document, SENTENCES_COUNT)
    # Join with a space: the original appended "" after every sentence, which
    # fused adjacent sentences together ("...end.Next sentence...").
    return " ".join(str(sentence) for sentence in sentences)


def generate_textrank_summary(research_paper_text):
    """Return an extractive summary of the text built with TextRank.

    Args:
        research_paper_text: Plain text to summarize.

    Returns:
        The selected sentences joined by single spaces ("" when none).
    """
    return _summarize_with(TextRankSummarizer, research_paper_text)


def generate_luhn_summary(research_paper_text):
    """Return an extractive summary of the text built with Luhn's method.

    Args:
        research_paper_text: Plain text to summarize.

    Returns:
        The selected sentences joined by single spaces ("" when none).
    """
    return _summarize_with(LuhnSummarizer, research_paper_text)


def generate_lsa_summary(research_paper_text):
    """Return an extractive summary of the text built with LSA.

    Args:
        research_paper_text: Plain text to summarize.

    Returns:
        The selected sentences joined by single spaces ("" when none).
    """
    return _summarize_with(LsaSummarizer, research_paper_text)
|
53 |
+
|
54 |
+
def sanitize_text(input_string):
    """Round-trip a string through UTF-8 and return the decoded result.

    The text comes back unchanged when it is encodable; the round trip
    serves as a validity check.

    Args:
        input_string: Text to validate.

    Returns:
        The string obtained by encoding to UTF-8 and decoding again.

    Raises:
        UnicodeEncodeError: If the text cannot be encoded (logged, re-raised).
        UnicodeDecodeError: If the bytes cannot be decoded (logged, re-raised).
    """
    try:
        round_tripped = input_string.encode('utf-8').decode('utf-8')
    except UnicodeEncodeError as encode_err:
        print(f"Encoding error: {encode_err}")
        raise
    except UnicodeDecodeError as decode_err:
        print(f"Decoding error: {decode_err}")
        raise
    else:
        return round_tripped
|
65 |
+
|
66 |
+
def generate_math_summary(research_paper_text):
    """Build the combined extractive summary (TextRank, Luhn, LSA).

    The three summarizers run concurrently on the sanitized text. Their
    outputs have newlines removed and are concatenated in that order,
    separated by a dashed rule.

    Args:
        research_paper_text: Plain text to summarize.

    Returns:
        The combined summary string, or ``False`` if any summarizer fails.

    Raises:
        UnicodeEncodeError, UnicodeDecodeError: Propagated from
            ``sanitize_text``, which runs before the guarded section.
    """
    from concurrent.futures import ThreadPoolExecutor

    sanitized_text = sanitize_text(research_paper_text)
    try:
        generators = (
            generate_textrank_summary,
            generate_luhn_summary,
            generate_lsa_summary,
        )
        # An executor re-raises worker exceptions from .result(). With bare
        # threads the real error was lost and only showed up later as
        # "'NoneType' object has no attribute 'replace'".
        with ThreadPoolExecutor(max_workers=len(generators)) as pool:
            pending = [pool.submit(generate, sanitized_text) for generate in generators]
            summaries = [future.result() for future in pending]
        separator = f"\n {'-'*30} \n"
        return separator.join(text.replace("\n", "") for text in summaries)
    except Exception as e:
        # Best-effort contract: log the cause and signal failure with False.
        print(e)
        return False
|
nlp_summarizer.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import threading
|
2 |
+
|
3 |
+
def generate_nlp_summary(client, temp_summary):
    """Ask the LLM for a structured summary of the given text.

    Args:
        client: Object exposing ``chat.completions.create`` (OpenAI SDK style).
        temp_summary: Text embedded into the user prompt.

    Returns:
        The content of the first completion choice, or ``False`` when the
        request or response handling raises.
    """
    print("Generating NLP Summary")
    messages = [
        {"role": "system", "content": "You are a helpful research assistant for generating well-formatted summaries from scientific research papers."},
        {"role": "user", "content": f'As a text script expert, please help me to write a short text script with the topic \" {temp_summary}\".You have three tasks, which are:\\n 1.to summarize the text I provided into a Summary .Please answer within 150-300 characters.\\n 2.to summarize the text I provided, using up to seven Highlight.\\n 3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Key Insight should not include timestamps.\\n Your output should use the following template strictly, provide the results for the three tasks:\\n ## Summary\\n ## Highlights\\n - Highlights\\n ## Key Insights\\n - Key Insights .\\n Importantly your output must use language \"English\"'},
    ]
    try:
        completion = client.chat.completions.create(
            model="hf:meta-llama/Meta-Llama-3.1-405B-Instruct",
            messages=messages,
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Keep the best-effort False contract, but report why it failed
        # instead of swallowing the exception silently.
        print(f"Error generating NLP Summary: {e}")
        return False
|
16 |
+
|
17 |
+
def generate_nlp_mindmap(client, temp_summary):
    """Ask the LLM for a markdown mindmap of the given text.

    Args:
        client: Object exposing ``chat.completions.create`` (OpenAI SDK style).
        temp_summary: Text embedded into the user prompt.

    Returns:
        The content of the first completion choice, or ``False`` when the
        request or response handling raises.
    """
    print("Generating NLP Mindmap")
    messages = [
        {"role": "system", "content": "You are a helpful research assistant for generating well-formatted mindmaps from scientific research papers."},
        {"role": "user", "content": f'As a text script expert, please help me to write a short text script with the topic \"{temp_summary}\".Your output should use the following template:\\n\\n## {{Subtitle01}}\\n- {{Bulletpoint01}}\\n- {{Bulletpoint02}}\\n## {{Subtitle02}}\\n- {{Bulletpoint03}}\\n- {{Bulletpoint04}}\\n\\nSummarize the giving topic to generate a mind map (as many subtitles as possible, with a minimum of three subtitles) structure markdown. Do not include anything in the response, that is not the part of mindmap.\\n Most Importantly your output must use language \"English\" and each point or pointer should include no more than 9 words.'},
    ]
    try:
        completion = client.chat.completions.create(
            model="hf:meta-llama/Meta-Llama-3.1-405B-Instruct",
            messages=messages,
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Keep the best-effort False contract, but report why it failed
        # instead of swallowing the exception silently.
        print(f"Error generating NLP Mindmap: {e}")
        return False
|
30 |
+
|
31 |
+
def generate_nlp_summary_and_mindmap(client, temp_summary):
    """Generate the LLM summary and mindmap concurrently.

    Each artefact is produced on its own thread, and both threads write
    their outcome into one shared dict.

    Args:
        client: Chat-completions client passed to both generators.
        temp_summary: Text both generators work from.

    Returns:
        A dict with ``summary_status``/``summary`` and
        ``mindmap_status``/``mindmap``. A falsy generator result is stored
        as status ``"error"`` with a ``None`` payload.
    """
    response = {}

    def record(status_key, payload_key, producer):
        # Run one generator and store its outcome under the given keys.
        generated = producer(client, temp_summary)
        if generated:
            response[status_key] = "success"
            response[payload_key] = generated
        else:
            response[status_key] = "error"
            response[payload_key] = None

    workers = [
        threading.Thread(target=record, args=('summary_status', 'summary', generate_nlp_summary)),
        threading.Thread(target=record, args=('mindmap_status', 'mindmap', generate_nlp_mindmap)),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return response
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
openai
|
tools.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import PyPDFLoader
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
|
4 |
+
def parse_pdf(pdf_path):
    """Extract the body text of a research paper from a PDF.

    The page texts are concatenated and, when both markers are found in
    order, trimmed to the span from the first ``"ABSTRACT"`` up to (but not
    including) the first ``"REFERENCES"``. The text is then re-chunked and
    joined with single spaces.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        The extracted text as a single string.
    """
    # load() rather than load_and_split(): the latter re-splits pages with an
    # overlapping default splitter, so joining its output repeats text.
    pages = PyPDFLoader(pdf_path).load()
    all_text = " ".join(page.page_content for page in pages)

    start_index = all_text.find("ABSTRACT")
    end_index = all_text.find("REFERENCES")
    if start_index != -1 and end_index != -1 and start_index < end_index:
        relevant_text = all_text[start_index:end_index]
    else:
        # Markers missing or out of order: fall back to the whole document.
        relevant_text = all_text

    # No overlap, and a space between chunks: the original used a 50-character
    # overlap and joined with "", which duplicated the overlapping text and
    # fused words at chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
    research_paper_text = " ".join(text_splitter.split_text(relevant_text))

    print(f"Length of Research Paper: {len(research_paper_text)}")
    return research_paper_text
|