# -*- coding: utf-8 -*-
"""AI-Powered Research Assistant for Scholars and Researchers

Automatically generated by Colab. The original file is located at
    https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb

Set up the environment by installing the required libraries, then run the
Gradio frontend for searching articles, summarizing content, and generating
citations.
"""
# !pip install gradio requests transformers beautifulsoup4 python-docx torch
import gradio as gr
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def search_related_articles_crossref(query, max_results=3):
    """Search for related articles using the CrossRef API."""
    try:
        url = "https://api.crossref.org/works"
        params = {"query": query, "rows": max_results}  # let requests URL-encode the query
        headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
        response = requests.get(url, params=params, headers=headers, timeout=10)
        if response.status_code == 200:
            articles = []
            data = response.json()
            for item in data['message']['items']:
                title = (item.get('title') or ['No Title'])[0]
                doi = item.get('DOI', 'No DOI')
                link = f"https://doi.org/{doi}"
                articles.append({"title": title, "link": link})
            if not articles:
                return [], "No articles found for the query."
            return articles, None
        else:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"
    except Exception as e:
        return [], f"Exception during CrossRef API call: {str(e)}"
from bs4 import BeautifulSoup
def extract_text_from_html(url):
    """Extract text content from an HTML page."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise on HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')
        # Simplified extraction: collect all <p> tags. Adjust the selector to the site structure.
        paragraphs = soup.find_all('p')
        text_content = "\n".join(para.get_text() for para in paragraphs)
        return text_content
    except Exception as e:
        return f"Error extracting text: {str(e)}"
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
def summarize_article(article_text):
    """Summarize a given article's text."""
    try:
        if not article_text or len(article_text.split()) < 20:
            return None, "Article content is too short to summarize."
        # Truncate the input so it fits within the model's context window
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # Adjust max_length to control input size
            padding="max_length"
        )
        # Generate the summary
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=400,  # Limit the length of the output
            min_length=100,  # Set a minimum length for the output
            no_repeat_ngram_size=3  # Avoid repeating phrases
        )
        # Decode the output to get the summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary, None
    except Exception as e:
        return None, f"Exception during summarization: {str(e)}"
# Load the citation-generation tokenizer and model
tokenizer_t5 = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")
def generate_citation_t5(article_title, citation_style, article_link):
    """Generate a citation using the T5 citation-generation model."""
    try:
        # Prepare the input text with explicit and structured formatting
        input_text = (f"'{article_title}'\n"
                      f"{article_link}\n"
                      f"Include author names, publication date, title, journal name, and DOI if available.\n"
                      f"Generate a {citation_style} style citation for the article")
        # Tokenize the input
        inputs = tokenizer_t5(input_text, return_tensors="pt", truncation=True, padding=True)
        # Generate the citation
        outputs = model_t5.generate(**inputs, max_new_tokens=70)
        # Decode the output to text
        citation = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
        return citation, None
    except Exception as e:
        return None, f"Exception during citation generation: {str(e)}"
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
def create_thesis_document(title, summary, citations):
    """Create a Word document formatted like a PhD thesis."""
    doc = Document()
    # Title page
    doc.add_paragraph(title, style='Title').alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph()  # Empty line
    doc.add_paragraph('Thesis', style='Heading 1').alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph('Author Name', style='Normal').alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph('University Name', style='Normal').alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph('Date', style='Normal').alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_page_break()
    # Summary page
    doc.add_paragraph('Summary', style='Heading 1').alignment = WD_ALIGN_PARAGRAPH.LEFT
    doc.add_paragraph(summary, style='Normal')
    doc.add_page_break()
    # Citations page
    doc.add_paragraph('Citations', style='Heading 1').alignment = WD_ALIGN_PARAGRAPH.LEFT
    for citation in citations:
        doc.add_paragraph(citation, style='Normal')
    file_path = "Research_Document.docx"
    doc.save(file_path)
    return file_path
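# Example (writes Research_Document.docx to the working directory):
#   path = create_thesis_document("My Topic", "A short summary.", ["Citation 1", "Citation 2"])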
def research_assistant(research_topic, citation_style):
    """Main function to search, summarize, and generate citations."""
    # Every return must match the four Gradio outputs (topic, summaries, citations, file)
    if not research_topic:
        return "Please enter a research topic.", "No summaries generated.", "No citations generated.", None
    # Character-limit check
    if len(research_topic) > 150:
        return "Error: Research topic exceeds 150 characters.", "", "", None
    # Search for related articles using CrossRef
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, "", "", None
    summaries = []
    citations = []
    article_content = ''
    for article in articles:
        try:
            # Fetching full text is often not feasible (paywalls, redirects); consider using metadata instead
            article_content += f"{extract_text_from_html(article['link'])}.\n"
            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)
        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")
    # Summarize the accumulated content once, after all articles are fetched
    summary, error = summarize_article(article_content)
    if error:
        summaries.append(f"Error summarizing article: {error}")
    else:
        summaries.append(summary)
    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
    return research_topic, "\n\n".join(summaries), "\n\n".join(citations), file_path
# Create the Gradio interface with download functionality
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=[
        gr.Textbox(label="Enter your research topic or question:",
                   placeholder="Enter your research topic (max 150 characters)"),
        gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
    ],
    outputs=[
        gr.Textbox(label="Research Topic"),
        gr.Textbox(label="Summaries of Articles"),
        gr.Textbox(label="Generated Citations"),
        gr.DownloadButton(label="Download Document")
    ],
    title="AI-Powered Research Assistant",
    allow_flagging="never"
)
gr_interface.launch(share=True)