# -*- coding: utf-8 -*-
"""AI-Powered Research Assistant for Scholars and Researchers.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
"""

# !pip install gradio requests transformers beautifulsoup4 python-docx torch

"""**Set Up the Environment:** install the required libraries by uncommenting
the `pip install` line above.

**Create the Gradio Frontend:** the app searches for articles, summarizes
their content, and generates citations.
"""
import gradio as gr
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def search_related_articles_crossref(query, max_results=3):
    """Search for related articles using the CrossRef API."""
    try:
        url = f"https://api.crossref.org/works?query={query}&rows={max_results}"
        headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            articles = []
            data = response.json()
            for item in data['message']['items']:
                # CrossRef may omit the title field or return an empty list
                title = (item.get('title') or ['No Title'])[0]
                doi = item.get('DOI', 'No DOI')
                link = f"https://doi.org/{doi}"
                articles.append({"title": title, "link": link})
            if not articles:
                return [], "No articles found for the query."
            return articles, None
        else:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"
    except Exception as e:
        return [], f"Exception during CrossRef API call: {str(e)}"
from bs4 import BeautifulSoup

def extract_text_from_html(url):
    """Extract text content from an HTML page."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Check for request errors
        soup = BeautifulSoup(response.text, 'html.parser')
        # This is a simplified example. You may need to adjust the selector
        # based on the site structure.
        paragraphs = soup.find_all('p')
        text_content = "\n".join([para.get_text() for para in paragraphs])
        return text_content
    except Exception as e:
        return f"Error extracting text: {str(e)}"
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")

def summarize_article(article_text):
    """Summarize a given article's text."""
    try:
        if not article_text or len(article_text.split()) < 20:
            return None, "Article content is too short to summarize."
        # Ensure the input text is not too long
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # Adjust max_length to control input size
            padding="max_length"
        )
        # Generate the summary
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=400,  # Limit the length of the output
            min_length=100,      # Set a minimum length for the output
            num_beams=4,         # Beam search; early_stopping only applies with beams
            # length_penalty=1.0,      # Adjust to encourage longer or shorter output
            # no_repeat_ngram_size=3,  # Avoid repetition of phrases
            early_stopping=True
        )
        # Decode the output to get the summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary, None
    except Exception as e:
        return None, f"Exception during summarization: {str(e)}"
# Load tokenizer and model
tokenizer_t5 = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")

def generate_citation_t5(article_title, citation_style, article_link):
    """Generate a citation using the T5 model."""
    try:
        # Prepare the input text with explicit and structured formatting
        input_text = (f"'{article_title}'\n"
                      f"{article_link}\n"
                      f"Include author names, publication date, title, journal name, and DOI if available.\n"
                      f"Generate a {citation_style} style citation for the article")
        # Tokenize the input
        inputs = tokenizer_t5(input_text, return_tensors="pt", truncation=True, padding=True)
        # Generate the citation
        outputs = model_t5.generate(**inputs, max_new_tokens=70)
        # Decode the output to text
        citation = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
        return citation, None
    except Exception as e:
        return None, f"Exception during citation generation: {str(e)}"
from docx import Document

def create_thesis_document(title, summary, citations):
    """Create a Word document formatted like a PhD thesis."""
    # Initialize Document
    doc = Document()
    # Title Page
    doc.add_paragraph(title, style='Title').alignment = 1  # Center alignment
    doc.add_paragraph()  # Add empty line
    # Title page details
    doc.add_paragraph('Thesis', style='Heading 1').alignment = 1
    doc.add_paragraph('Author Name', style='Normal').alignment = 1
    doc.add_paragraph('University Name', style='Normal').alignment = 1
    doc.add_paragraph('Date', style='Normal').alignment = 1
    doc.add_page_break()
    # Summary Page
    doc.add_paragraph('Summary', style='Heading 1').alignment = 0  # Left alignment
    doc.add_paragraph(summary, style='Normal')
    doc.add_page_break()
    # Citation Page
    doc.add_paragraph('Citations', style='Heading 1').alignment = 0
    for citation in citations:
        doc.add_paragraph(citation, style='Normal')
    file_path = "Research_Document.docx"
    doc.save(file_path)
    return file_path
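# A quick sketch of the document builder with placeholder content (names are
# illustrative); it writes Research_Document.docx to the working directory.
def _demo_document():
    path = create_thesis_document(
        "Sample Topic",
        "A short placeholder summary.",
        ["Placeholder citation 1.", "Placeholder citation 2."]
    )
    print(f"Saved: {path}")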
def research_assistant(research_topic, citation_style):
    """Main function to search, summarize, and generate citations."""
    if not research_topic:
        return "Please enter a research topic.", "No summaries generated.", "No citations generated.", None
    # Character limit check
    if len(research_topic) > 150:
        return "Error: Research topic exceeds 150 characters.", "", "", None
    # Search for related articles using CrossRef
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, "", "", None
    summaries = []
    citations = []
    article_content = ''
    for article in articles:
        try:
            # Fetching full text may not be feasible for every DOI link, which
            # usually redirects to a publisher page; this is best-effort.
            article_content += f"{extract_text_from_html(article['link'])}.\n"
            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)
        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")
    # Summarize the combined article content once, after the loop
    summary, error = summarize_article(article_content)
    if error:
        summaries.append(f"Error summarizing articles: {error}")
    else:
        summaries.append(summary)
    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
    return research_topic, "\n\n".join(summaries), "\n\n".join(citations), file_path
# Create Gradio Interface with download functionality
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=[
        gr.Textbox(label="Enter your research topic or question:",
                   placeholder="Enter your research topic (max 150 characters)"),
        gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
    ],
    outputs=[
        gr.Textbox(label="Research Topic"),
        gr.Textbox(label="Summaries of Articles"),
        gr.Textbox(label="Generated Citations"),
        gr.DownloadButton(label="Download Document")
    ],
    title="AI-Powered Research Assistant",
    allow_flagging="never"
)

gr_interface.launch(share=True)