# --- Non-code residue scraped from the hosting page (commented out so the file parses) ---
# Spaces: Sleeping
# File size: 8,415 Bytes
# (commit-hash gutter and line-number gutter removed)
# -*- coding: utf-8 -*-
"""AI-Powered Research Assistant for Scholars and Researchers.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
"""
# !pip install gradio requests transformers beautifulsoup4 python-docx torch
"""**Set Up the Environment:** Install the required libraries
**Create the Gradio Frontend:** searching for articles, summarizing content, generating citations
"""
import gradio as gr
import requests
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def search_related_articles_crossref(query, max_results=3):
    """Search for related articles using the CrossRef REST API.

    Args:
        query: Free-text search string (may contain spaces/special characters).
        max_results: Maximum number of articles to return (CrossRef ``rows``).

    Returns:
        Tuple ``(articles, error)``: ``articles`` is a list of
        ``{"title": ..., "link": ...}`` dicts and ``error`` is ``None`` on
        success, or a human-readable message on failure (with ``articles=[]``).
    """
    try:
        url = "https://api.crossref.org/works"
        # Pass the query via `params` so requests URL-encodes it; the original
        # f-string interpolation broke on spaces and special characters.
        params = {"query": query, "rows": max_results}
        headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
        # Timeout keeps the UI responsive if CrossRef is slow or unreachable.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        if response.status_code == 200:
            data = response.json()
            articles = []
            for item in data['message']['items']:
                # CrossRef may return a missing or empty title list; guard both.
                titles = item.get('title') or ['No Title']
                doi = item.get('DOI', 'No DOI')
                articles.append({"title": titles[0], "link": f"https://doi.org/{doi}"})
            if not articles:
                return [], "No articles found for the query."
            return articles, None
        else:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"
    except Exception as e:
        return [], f"Exception during CrossRef API call: {str(e)}"
from bs4 import BeautifulSoup
def extract_text_from_html(url):
    """Fetch *url* and return the concatenated text of its ``<p>`` elements.

    Args:
        url: Page to scrape (here, a DOI landing page).

    Returns:
        The extracted paragraph text joined with newlines, or an
        ``"Error extracting text: ..."`` string on any failure
        (network error, non-2xx status, parse error).
    """
    try:
        # Timeout prevents a slow/blocking publisher site from hanging the app.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise on 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        # Simplified extraction: take every paragraph. Adjust the selector
        # per-site if a more precise article body is needed.
        paragraphs = soup.find_all('p')
        return "\n".join(para.get_text() for para in paragraphs)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
# Long-document summarization model (LongT5 fine-tuned for book summaries,
# 16k-token context) used by summarize_article() below. Downloaded from the
# HuggingFace Hub on first run.
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
def summarize_article(article_text):
    """Summarize article text with the LongT5 book-summary model.

    Args:
        article_text: Raw text to summarize.

    Returns:
        Tuple ``(summary, error)``: the generated summary and ``None`` on
        success; ``(None, message)`` on failure or when the input is too
        short to be worth summarizing.
    """
    try:
        # Guard against empty or near-empty input, which yields junk summaries.
        if not article_text or len(article_text.split()) < 20:
            return None, "Article content is too short to summarize."
        # Truncate long inputs; padding is unnecessary for a single sequence.
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # Controls how much of the input the model sees
        )
        # Greedy generation; `early_stopping` was dropped because it only
        # applies to beam search and merely triggered a warning here.
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=400,  # Upper bound on summary length
            min_length=100,      # Encourage a reasonably detailed summary
        )
        # Decode token ids back to text, stripping special tokens.
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary, None
    except Exception as e:
        return None, f"Exception during summarization: {str(e)}"
# Citation-generation model (T5 fine-tuned by scieditor) used by
# generate_citation_t5() below. Downloaded from the HuggingFace Hub on first run.
tokenizer_t5 = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")
def generate_citation_t5(article_title, citation_style, article_link):
    """Generate a citation for an article with the citation-generation T5 model.

    Args:
        article_title: Title of the article to cite.
        citation_style: Requested style, e.g. "APA", "MLA", "Chicago".
        article_link: URL (DOI link) of the article.

    Returns:
        Tuple ``(citation, error)``: the generated citation string and
        ``None`` on success, or ``(None, message)`` on failure.
    """
    try:
        # Assemble the structured prompt fed to the model.
        prompt = "\n".join([
            f"'{article_title}'",
            f"{article_link}",
            "Include author names, publication date, title, journal name, and DOI if available.",
            f"Generate a {citation_style} style citation for the article",
        ])
        # Tokenize, generate, and decode back to plain text.
        encoded = tokenizer_t5(prompt, return_tensors="pt", truncation=True, padding=True)
        generated = model_t5.generate(**encoded, max_new_tokens=70)
        return tokenizer_t5.decode(generated[0], skip_special_tokens=True), None
    except Exception as e:
        return None, f"Exception during citation generation: {str(e)}"
from docx import Document
from docx.shared import Pt
from docx.oxml.ns import qn
def create_thesis_document(title, summary, citations, file_path="Research_Document.docx"):
    """Create a Word document formatted like a PhD thesis and save it.

    Args:
        title: Document title, shown on the title page.
        summary: Summary text placed on its own page.
        citations: Iterable of citation strings, one paragraph each.
        file_path: Destination .docx path (default preserves original behavior).

    Returns:
        The path the document was saved to.
    """
    doc = Document()
    # --- Title page ---
    doc.add_paragraph(title, style='Title').alignment = 1  # 1 = center alignment
    doc.add_paragraph()  # Spacer line
    doc.add_paragraph('Thesis', style='Heading 1').alignment = 1
    doc.add_paragraph('Author Name', style='Normal').alignment = 1
    doc.add_paragraph('University Name', style='Normal').alignment = 1
    doc.add_paragraph('Date', style='Normal').alignment = 1
    doc.add_page_break()
    # --- Summary page ---
    doc.add_paragraph('Summary', style='Heading 1').alignment = 0  # 0 = left alignment
    doc.add_paragraph(summary, style='Normal')
    doc.add_page_break()
    # --- Citations page ---
    doc.add_paragraph('Citations', style='Heading 1').alignment = 0
    for citation in citations:
        doc.add_paragraph(citation, style='Normal')
    doc.save(file_path)
    return file_path
def research_assistant(research_topic, citation_style):
    """Search CrossRef, summarize the articles, generate citations, and
    build a downloadable Word document.

    Args:
        research_topic: User-entered topic or question (max 150 chars).
        citation_style: One of the styles offered in the UI (APA/MLA/Chicago).

    Returns:
        A 4-tuple matching the four Gradio outputs:
        (topic or error message, summaries, citations, document path or None).
    """
    # BUG FIX: every return now has 4 elements. The original early returns
    # yielded only 3 values while the interface declares 4 outputs, crashing
    # the UI on empty/too-long input or a search error.
    if not research_topic:
        return "Please enter a research topic.", ["No summaries generated."], ["No citations generated."], None
    # Character limit check (mirrors the placeholder hint in the UI).
    if len(research_topic) > 150:
        return "Error: Research topic exceeds 150 characters.", [], [], None
    # Search for related articles using CrossRef.
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, [], [], None
    summaries = []
    citations = []
    article_content = ''
    for article in articles:
        try:
            # Best-effort: DOI landing pages often block scraping, so failures
            # surface as error strings folded into the combined text.
            article_content += f"{extract_text_from_html(article['link'])}.\n"
            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)
        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")
    # One combined summary over all fetched article text.
    summary, error = summarize_article(article_content)
    if error:
        summaries.append(f"Error summarizing article: {error}")
    else:
        summaries.append(summary)
    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
    return research_topic, summaries, citations, file_path
# Create the Gradio interface wiring research_assistant's 4 return values
# to the 4 outputs below. (Removed a stray "|" scraping artifact that made
# the launch line a syntax error.)
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=[
        gr.Textbox(label="Enter your research topic or question:", placeholder="Enter your research topic (max 150 characters)"),
        gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
    ],
    outputs=[
        gr.Textbox(label="Research Topic"),
        gr.Textbox(label="Summaries of Articles"),
        gr.Textbox(label="Generated Citations"),
        gr.DownloadButton(label="Download Document")
    ],
    title="AI-Powered Research Assistant",
    allow_flagging="never"
)
# share=True publishes a temporary public URL (needed when running in Colab).
gr_interface.launch(share=True)