raannakasturi committed
Commit 2427a02 · 1 Parent(s): 35c2c5b

Add initial project structure with PDF summarization features and Gradio interface

Files changed (8)
  1. .gitignore +1 -0
  2. README.md +1 -1
  3. app.py +54 -0
  4. main.py +35 -0
  5. math_summarizer.py +91 -0
  6. nlp_summarizer.py +56 -0
  7. requirements.txt +1 -0
  8. tools.py +19 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Rexplore Api
- emoji: 🌖
+ emoji: 📑
 colorFrom: red
 colorTo: yellow
 sdk: gradio
app.py ADDED
@@ -0,0 +1,54 @@
+ import gradio as gr
+ from main import main
+
+ def rexplore_summarizer(corpus):
+     response = main(corpus)
+     return response, response['summary'], response['mindmap']
+
+ def clear_everything(text_corpus, raw_data, summary, mindmap):
+     return None, None, None, None
+
+ theme = gr.themes.Soft(
+     primary_hue="purple",
+     secondary_hue="cyan",
+     neutral_hue="slate",
+     font=[
+         gr.themes.GoogleFont('Syne'),
+         gr.themes.GoogleFont('Poppins'),
+         gr.themes.GoogleFont('Poppins'),
+         gr.themes.GoogleFont('Poppins')
+     ],
+ )
+
+ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as app:
+     gr.HTML(
+         value='''
+         <h1 style="text-align: center;">ReXplore Summarizer <p style="text-align: center;">Designed and Developed by <a href='https://raannakasturi.eu.org' target="_blank" rel="nofollow noreferrer external">Nayan Kasturi</a></p></h1>
+         <p style="text-align: center;">This app uses a hybrid approach to summarize PDF documents, running on CPU as well as GPU.</p>
+         <p style="text-align: center;">It combines traditional algorithms such as TextRank, LSA, and Luhn with a large language model (LLM) to generate summaries and mindmaps.</p>
+         <p style="text-align: center;">Summarization can take some time depending on the size of the text corpus and the complexity of the content.</p>
+         ''')
+     with gr.Row():
+         with gr.Column():
+             text_corpus = gr.TextArea(label="Text Corpus", placeholder="Paste the text corpus here", lines=5)
+             with gr.Row():
+                 clear_btn = gr.Button(value="Clear", variant='stop')
+                 summarize_btn = gr.Button(value="Summarize", variant='primary')
+         raw_data = gr.TextArea(label="Raw Data", placeholder="The generated raw data will be displayed here", lines=7, interactive=False, show_copy_button=True)
+     with gr.Row():
+         summary = gr.TextArea(label="Summary", placeholder="The generated summary will be displayed here", lines=7, interactive=False, show_copy_button=True)
+         mindmap = gr.TextArea(label="Mindmap", placeholder="The generated mindmap will be displayed here", lines=7, interactive=False, show_copy_button=True)
+
+     summarize_btn.click(
+         rexplore_summarizer,
+         inputs=[text_corpus],
+         outputs=[raw_data, summary, mindmap],
+         concurrency_limit=1,
+         scroll_to_output=True,
+         show_api=True,
+         api_name="rexplore_summarizer",
+         show_progress="full",
+     )
+     clear_btn.click(clear_everything, inputs=[text_corpus, raw_data, summary, mindmap], outputs=[text_corpus, raw_data, summary, mindmap], show_api=False)
+
+ app.queue(default_concurrency_limit=1).launch(show_api=True)
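Because the Summarize handler is exposed with api_name="rexplore_summarizer", the interface can also be called programmatically. A minimal sketch using gradio_client, where the Space id "raannakasturi/rexplore-api" is a hypothetical placeholder for the actual deployment:

from gradio_client import Client

client = Client("raannakasturi/rexplore-api")  # hypothetical Space id
# The endpoint returns a tuple matching the three wired outputs: (raw_data, summary, mindmap)
raw_data, summary, mindmap = client.predict(
    "Paste the text corpus here...",  # corpus text
    api_name="/rexplore_summarizer",
)
print(summary)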
main.py ADDED
@@ -0,0 +1,35 @@
+ from math_summarizer import generate_math_summary
+ from nlp_summarizer import generate_nlp_summary_and_mindmap
+ from tools import parse_pdf
+ import openai
+ import time
+
+ def create_client(api_key):
+     client = openai.OpenAI(
+         api_key=api_key,
+         base_url="https://glhf.chat/api/openai/v1",
+     )
+     return client
+
+ def generate_summary(client, corpus):
+     response = {}
+     print("Generating Math Summary")
+     math_summary = generate_math_summary(corpus)
+     if not math_summary:
+         print("Error generating Math Summary")
+         response['summary_status'] = "error"
+         response['summary'] = None
+         response['mindmap_status'] = "error"
+         response['mindmap'] = None
+         return response
+     else:
+         print("Math Summary Generated Successfully")
+         print("Generating NLP Summary and Mindmap")
+         response = generate_nlp_summary_and_mindmap(client, math_summary)
+         print("NLP Summary and Mindmap Generated Successfully")
+         return response
+
+ def main(corpus):
+     client = create_client("glhf_0d3c695626fec941eeb7914dd0e36da5")
+     response = generate_summary(client, corpus)
+     return response
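One caveat worth flagging: main() embeds the glhf.chat API key directly in the source, so the secret ships with the repo. A safer variant, sketched here on the assumption that the key is provided via an environment variable (GLHF_API_KEY is an illustrative name, not part of the project):

import os

def main(corpus):
    # Hypothetical env var; keeps the secret out of version control.
    api_key = os.environ["GLHF_API_KEY"]
    client = create_client(api_key)
    return generate_summary(client, corpus)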
math_summarizer.py ADDED
@@ -0,0 +1,91 @@
+ from sumy.parsers.plaintext import PlaintextParser
+ from sumy.nlp.tokenizers import Tokenizer
+ from sumy.summarizers.text_rank import TextRankSummarizer
+ from sumy.summarizers.luhn import LuhnSummarizer
+ from sumy.summarizers.lex_rank import LexRankSummarizer
+ from sumy.summarizers.lsa import LsaSummarizer
+ from sumy.nlp.stemmers import Stemmer
+ from sumy.utils import get_stop_words
+ import nltk
+ import threading
+
+ LANGUAGE = "english"
+ SENTENCES_COUNT = 15
+
+ def generate_textrank_summary(research_paper_text):
+     nltk.download('punkt', quiet=True)
+     nltk.download('punkt_tab', quiet=True)
+     parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
+     stemmer = Stemmer(LANGUAGE)
+     summarizer = TextRankSummarizer(stemmer)
+     summarizer.stop_words = get_stop_words(LANGUAGE)
+     sentences = summarizer(parser.document, SENTENCES_COUNT)
+     summary = ""
+     for sentence in sentences:
+         summary += str(sentence) + " "
+     return summary
+
+ def generate_luhn_summary(research_paper_text):
+     nltk.download('punkt', quiet=True)
+     nltk.download('punkt_tab', quiet=True)
+     parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
+     stemmer = Stemmer(LANGUAGE)
+     summarizer = LuhnSummarizer(stemmer)
+     summarizer.stop_words = get_stop_words(LANGUAGE)
+     sentences = summarizer(parser.document, SENTENCES_COUNT)
+     summary = ""
+     for sentence in sentences:
+         summary += str(sentence) + " "
+     return summary
+
+ def generate_lsa_summary(research_paper_text):
+     nltk.download('punkt', quiet=True)
+     nltk.download('punkt_tab', quiet=True)
+     parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
+     stemmer = Stemmer(LANGUAGE)
+     summarizer = LsaSummarizer(stemmer)
+     summarizer.stop_words = get_stop_words(LANGUAGE)
+     sentences = summarizer(parser.document, SENTENCES_COUNT)
+     summary = ""
+     for sentence in sentences:
+         summary += str(sentence) + " "
+     return summary
+
+ def sanitize_text(input_string):
+     try:
+         encoded_bytes = input_string.encode('utf-8')
+         decoded_string = encoded_bytes.decode('utf-8')
+         return decoded_string
+     except UnicodeEncodeError as e:
+         print(f"Encoding error: {e}")
+         raise
+     except UnicodeDecodeError as e:
+         print(f"Decoding error: {e}")
+         raise
+
+ def generate_math_summary(research_paper_text):
+     sanitized_text = sanitize_text(research_paper_text)
+     try:
+         textrank_summary = luhn_summary = lsa_summary = None
+         def run_textrank():
+             nonlocal textrank_summary
+             textrank_summary = generate_textrank_summary(sanitized_text)
+         def run_luhn():
+             nonlocal luhn_summary
+             luhn_summary = generate_luhn_summary(sanitized_text)
+         def run_lsa():
+             nonlocal lsa_summary
+             lsa_summary = generate_lsa_summary(sanitized_text)
+         threads = []
+         threads.append(threading.Thread(target=run_textrank))
+         threads.append(threading.Thread(target=run_luhn))
+         threads.append(threading.Thread(target=run_lsa))
+         for thread in threads:
+             thread.start()
+         for thread in threads:
+             thread.join()
+         math_summary = textrank_summary.replace("\n", "") + f"\n {'-'*30} \n" + luhn_summary.replace("\n", "") + f"\n {'-'*30} \n" + lsa_summary.replace("\n", "")
+         return math_summary
+     except Exception as e:
+         print(e)
+         return False
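generate_math_summary runs the three extractive summarizers on parallel threads and joins their outputs with a dashed divider, returning False on any failure. A minimal smoke test, assuming sumy and nltk are installed and the module is importable as math_summarizer:

from math_summarizer import generate_math_summary

# Repeat a few sentences so each summarizer has enough material to rank.
sample = (
    "Transformers process tokens in parallel using self-attention. "
    "Recurrent networks process tokens sequentially and struggle with long contexts. "
    "Attention lets every token attend to every other token in one step. "
) * 10
result = generate_math_summary(sample)
if result:
    print(result)  # three summaries separated by '-' * 30 dividers
else:
    print("summarization failed")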
nlp_summarizer.py ADDED
@@ -0,0 +1,56 @@
+ import threading
+
+ def generate_nlp_summary(client, temp_summary):
+     print("Generating NLP Summary")
+     try:
+         completion = client.chat.completions.create(
+             model="hf:meta-llama/Meta-Llama-3.1-405B-Instruct",
+             messages=[
+                 {"role": "system", "content": "You are a helpful research assistant for generating well-formatted summaries from scientific research papers."},
+                 {"role": "user", "content": f'As a text script expert, please help me to write a short text script with the topic \"{temp_summary}\". You have three tasks, which are:\\n 1. to summarize the text I provided into a Summary. Please answer within 150-300 characters.\\n 2. to summarize the text I provided, using up to seven Highlights.\\n 3. to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Key Insights should not include timestamps.\\n Your output should use the following template strictly, provide the results for the three tasks:\\n ## Summary\\n ## Highlights\\n - Highlights\\n ## Key Insights\\n - Key Insights.\\n Importantly, your output must use language \"English\"'}
+             ]
+         )
+         return completion.choices[0].message.content
+     except Exception:
+         return False
+
+ def generate_nlp_mindmap(client, temp_summary):
+     print("Generating NLP Mindmap")
+     try:
+         completion = client.chat.completions.create(
+             model="hf:meta-llama/Meta-Llama-3.1-405B-Instruct",
+             messages=[
+                 {"role": "system", "content": "You are a helpful research assistant for generating well-formatted mindmaps from scientific research papers."},
+                 {"role": "user", "content": f'As a text script expert, please help me to write a short text script with the topic \"{temp_summary}\". Your output should use the following template:\\n\\n## {{Subtitle01}}\\n- {{Bulletpoint01}}\\n- {{Bulletpoint02}}\\n## {{Subtitle02}}\\n- {{Bulletpoint03}}\\n- {{Bulletpoint04}}\\n\\nSummarize the given topic to generate a mind map (as many subtitles as possible, with a minimum of three subtitles) in markdown structure. Do not include anything in the response that is not part of the mindmap.\\n Most importantly, your output must use language \"English\" and each point or pointer should include no more than 9 words.'}
+             ]
+         )
+         return completion.choices[0].message.content
+     except Exception:
+         return False
+
+ def generate_nlp_summary_and_mindmap(client, temp_summary):
+     response = {}
+     def local_generate_nlp_summary():
+         nlp_summary = generate_nlp_summary(client, temp_summary)
+         if not nlp_summary:
+             response['summary_status'] = "error"
+             response['summary'] = None
+         else:
+             response['summary_status'] = "success"
+             response['summary'] = nlp_summary
+     def local_generate_nlp_mindmap():
+         nlp_mindmap = generate_nlp_mindmap(client, temp_summary)
+         if not nlp_mindmap:
+             response['mindmap_status'] = "error"
+             response['mindmap'] = None
+         else:
+             response['mindmap_status'] = "success"
+             response['mindmap'] = nlp_mindmap
+     threads = []
+     threads.append(threading.Thread(target=local_generate_nlp_summary))
+     threads.append(threading.Thread(target=local_generate_nlp_mindmap))
+     for thread in threads:
+         thread.start()
+     for thread in threads:
+         thread.join()
+     return response
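generate_nlp_summary_and_mindmap fires both LLM calls on separate threads and merges their results into one dict; this is safe here because each worker writes only its own keys. The returned structure, as consumed by app.py, always has this shape:

response = {
    "summary_status": "success",        # or "error"
    "summary": "## Summary\n...",       # None when summary_status == "error"
    "mindmap_status": "success",        # or "error"
    "mindmap": "## Subtitle01\n- ...",  # None when mindmap_status == "error"
}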
requirements.txt ADDED
@@ -0,0 +1 @@
+ openai
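Only openai is pinned here, although the code also imports gradio, sumy, nltk, and langchain_community, and PyPDFLoader needs a PDF backend at runtime. A fuller requirements.txt would plausibly look like the sketch below; the exact distribution names for the langchain split and the pypdf dependency are assumptions, not taken from this commit:

openai
gradio
sumy
nltk
langchain
langchain-community
pypdf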
tools.py ADDED
@@ -0,0 +1,19 @@
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ def parse_pdf(pdf_path):
+     loader = PyPDFLoader(pdf_path)
+     pages = loader.load_and_split()
+     all_text = " ".join([page.page_content for page in pages])
+     start_index = all_text.find("ABSTRACT")
+     end_index = all_text.find("REFERENCES")
+     if start_index != -1 and end_index != -1 and start_index < end_index:
+         relevant_text = all_text[start_index:end_index]
+     else:
+         relevant_text = all_text
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
+     text_list = text_splitter.split_text(relevant_text)
+     research_paper_text = "".join(text_list)
+     length_of_research_paper = len(research_paper_text)
+     print(f"Length of Research Paper: {length_of_research_paper}")
+     return research_paper_text
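parse_pdf trims the extracted text to the ABSTRACT-to-REFERENCES span when both markers are present, falling back to the full document otherwise. Note that splitting with chunk_overlap=50 and then re-joining with "".join repeats roughly 50 characters at each chunk boundary, so the split/join round-trip may not be intentional. A usage sketch with a hypothetical file path:

from tools import parse_pdf

# Hypothetical local path to a research paper PDF.
text = parse_pdf("papers/attention_is_all_you_need.pdf")
print(text[:500])  # inspect the start of the extracted span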