smfaiz committed on
Commit 70c2b2c · verified · 1 Parent(s): c6a33c8

Create app.py

Files changed (1)
  1. app.py +241 -0
app.py ADDED
@@ -0,0 +1,241 @@
+ # -*- coding: utf-8 -*-
+ """AI-Powered Research Assistant for Scholars and Researchers.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
+ """
+
+ # Colab-only install command (the "!" shell syntax is invalid in a plain .py file):
+ # !pip install gradio requests transformers beautifulsoup4 python-docx torch
+
+ """**Set Up the Environment:** install the required libraries.
+
+ **Create the Gradio Frontend:** search for articles, summarize content, generate citations.
+ """
+
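+ # Note: when this file runs as a Hugging Face Space, the dependencies above are
+ # typically declared in a requirements.txt file rather than installed at runtime.
+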
+ import gradio as gr
+ import requests
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ # These two pipelines were initialized but never used below (the app loads its
+ # models explicitly instead); they are left commented out to avoid pulling two
+ # extra models into memory. Re-enabling them also requires
+ # `from transformers import pipeline`.
+ # summarizer = pipeline("summarization", model="scieditor/citation-generation-t5")
+ # citation_generator = pipeline("text-generation", model="gpt2")
+
+ def search_related_articles_crossref(query, max_results=3):
+     """Search for related articles using the CrossRef API."""
+     try:
+         # Pass the query via params so requests URL-encodes it correctly.
+         url = "https://api.crossref.org/works"
+         params = {"query": query, "rows": max_results}
+         headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
+         response = requests.get(url, params=params, headers=headers, timeout=30)
+
+         if response.status_code == 200:
+             articles = []
+             data = response.json()
+             for item in data['message']['items']:
+                 title = item.get('title', ['No Title'])[0]
+                 doi = item.get('DOI', 'No DOI')
+                 link = f"https://doi.org/{doi}"
+                 articles.append({"title": title, "link": link})
+
+             if not articles:
+                 return [], "No articles found for the query."
+             return articles, None
+         else:
+             return [], f"Error fetching articles: {response.status_code} - {response.text}"
+     except Exception as e:
+         return [], f"Exception during CrossRef API call: {str(e)}"
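+
+ # Illustrative call (the titles and DOIs sketched below are placeholders, not
+ # real CrossRef output):
+ #   articles, err = search_related_articles_crossref("federated learning")
+ #   # articles -> [{"title": "...", "link": "https://doi.org/..."}, ...]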
+
+ from bs4 import BeautifulSoup
+
+ def extract_text_from_html(url):
+     """Extract text content from an HTML page."""
+     try:
+         response = requests.get(url, timeout=30)
+         response.raise_for_status()  # Check for request errors
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Simplified extraction: take every <p> tag. Adjust the selector to the
+         # structure of the target site; DOI links usually redirect to publisher
+         # pages, which may be paywalled or block scripted access.
+         paragraphs = soup.find_all('p')
+         text_content = "\n".join([para.get_text() for para in paragraphs])
+
+         return text_content
+     except Exception as e:
+         return f"Error extracting text: {str(e)}"
+
+ tokenizer_s = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
+ model_s = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
+
+ def summarize_article(article_text):
+     """Summarize a given article's text."""
+     try:
+         if not article_text or len(article_text.split()) < 20:
+             return None, "Article content is too short to summarize."
+
+         # Ensure the input text is not too long
+         inputs = tokenizer_s(
+             article_text,
+             return_tensors="pt",
+             truncation=True,
+             max_length=512,  # Adjust max_length to control input size
+             padding="max_length"
+         )
+
+         # Generate the summary
+         summary_ids = model_s.generate(
+             **inputs,
+             max_new_tokens=400,        # Limit the length of the output
+             min_length=100,            # Set a minimum length for the output
+             # length_penalty=1.0,      # Adjust length penalty to encourage longer output
+             # no_repeat_ngram_size=3,  # Avoid repetition of phrases
+             early_stopping=True        # Only takes effect with beam search (num_beams > 1)
+         )
+
+         # Decode the output to get the summary
+         summary = tokenizer_s.decode(summary_ids[0], skip_special_tokens=True)
+
+         return summary, None
+     except Exception as e:
+         return None, f"Exception during summarization: {str(e)}"
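+
+ # Example call (hypothetical input file; actual output depends on the model):
+ #   summary, err = summarize_article(open("article.txt").read())
+ #   if err is None: print(summary)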
+
+ # Load tokenizer and model for citation generation
+ tokenizer = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
+ model = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")
+
+ def generate_citation_t5(article_title, citation_style, article_link):
+     """Generate a citation using the citation-generation T5 model."""
+     try:
+         # Prepare the input text with explicit and structured formatting
+         input_text = (f"'{article_title}'\n"
+                       f"{article_link}\n"
+                       f"Include author names, publication date, title, journal name, and DOI if available.\n"
+                       f"Generate a {citation_style} style citation for the article")
+
+         # Tokenize the input
+         inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
+
+         # Generate the citation
+         outputs = model.generate(**inputs, max_new_tokens=70)
+
+         # Decode the output to text
+         citation = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         return citation, None
+     except Exception as e:
+         return None, f"Exception during citation generation: {str(e)}"
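+
+ # Illustrative usage. The model only sees the title and link, so author names,
+ # years, or journal fields in its output may be invented and should be checked
+ # against the DOI record:
+ #   citation, err = generate_citation_t5("Some Title", "APA", "https://doi.org/...")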
+
+ from docx import Document
+
+ def create_thesis_document(title, summary, citations):
+     """Create a Word document formatted like a PhD thesis."""
+
+     # Initialize Document
+     doc = Document()
+
+     # Title Page (alignment: 1 = center, 0 = left)
+     doc.add_paragraph(title, style='Title').alignment = 1
+     doc.add_paragraph()  # Add empty line
+
+     # Additional title page details (placeholder values)
+     doc.add_paragraph('Thesis', style='Heading 1').alignment = 1
+     doc.add_paragraph('Author Name', style='Normal').alignment = 1
+     doc.add_paragraph('University Name', style='Normal').alignment = 1
+     doc.add_paragraph('Date', style='Normal').alignment = 1
+
+     doc.add_page_break()
+
+     # Summary Page
+     doc.add_paragraph('Summary', style='Heading 1').alignment = 0
+     doc.add_paragraph(summary, style='Normal')
+
+     doc.add_page_break()
+
+     # Citation Page
+     doc.add_paragraph('Citations', style='Heading 1').alignment = 0
+     for citation in citations:
+         doc.add_paragraph(citation, style='Normal')
+
+     # Save Document
+     file_path = "Research_Document.docx"
+     doc.save(file_path)
+     return file_path
+
+ # Example Usage
+ # title = "Federated Learning\nA Comprehensive Study"
+ # summary = "This thesis explores federated learning techniques including quantum natural gradient descent and their applications in various domains. Detailed analysis and results are provided."
+ # citations = [
+ #     "Federated quantum natural gradient descent for quantum federated learning. DOI: https://doi.org/10.1016/b978-0-44-319037-7.00028-4",
+ #     "Federated Machine Learning for Systems Medicine. DOI: https://doi.org/10.14293/gof.23.06",
+ #     "Adversarial robustness in federated learning. DOI: https://doi.org/10.1016/b978-0-44-319037-7.00013-2"
+ # ]
+ # create_thesis_document(title, summary, citations)
+
+ def research_assistant(research_topic, citation_style):
+     """Main function to search, summarize, and generate citations."""
+     # Every branch must return four values to match the four output components.
+     if not research_topic:
+         return "Please enter a research topic.", "No summaries generated.", "No citations generated.", None
+
+     # Character limit check
+     if len(research_topic) > 75:
+         return "Error: Research topic exceeds 75 characters.", "", "", None
+
+     # Search for related articles using CrossRef
+     articles, error = search_related_articles_crossref(research_topic)
+
+     if error:
+         return error, "", "", None
+
+     summaries = []
+     citations = []
+     article_content = ''
+
+     for article in articles:
+         try:
+             # Fetching full article content might not be feasible (DOI pages are
+             # often paywalled); this may only capture landing-page text.
+             article_content += f"{extract_text_from_html(article['link'])}.\n"
+
+             citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
+             if error:
+                 citations.append(f"Error generating citation for '{article['title']}': {error}")
+             else:
+                 citations.append(citation)
+
+         except Exception as e:
+             summaries.append(f"Error processing article '{article['title']}': {str(e)}")
+             citations.append(f"Error generating citation for '{article['title']}': {str(e)}")
+
+     summary, error = summarize_article(article_content)
+     if error:
+         summaries.append(f"Error summarizing article: {error}")
+     else:
+         summaries.append(summary)
+
+     file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
+     # Join the lists into strings so they render cleanly in the Textbox outputs.
+     return research_topic, "\n\n".join(summaries), "\n\n".join(citations), file_path
+
+ # Create Gradio Interface with download functionality
+ gr_interface = gr.Interface(
+     fn=research_assistant,
+     inputs=[
+         gr.Textbox(label="Enter your research topic or question:", placeholder="Enter your research topic (max 75 characters)", lines=2),
+         gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
+     ],
+     outputs=[
+         gr.Textbox(label="Research Topic"),
+         gr.Textbox(label="Summaries of Articles"),
+         gr.Textbox(label="Generated Citations"),
+         gr.DownloadButton(label="Download Document")
+     ],
+     title="AI-Powered Research Assistant",
+     allow_flagging="never"
+ )
+
+ gr_interface.launch(debug=True)
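+
+ # debug=True streams server logs to the console. When running locally (outside
+ # a hosted Space), launch(share=True) would also create a temporary public link.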