# -*- coding: utf-8 -*-
"""AI-Powered Research Assistant for Scholars and Researchers.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
"""

# !pip install gradio requests transformers beautifulsoup4 python-docx torch

"""**Set Up the Environment:** Install the required libraries.

**Create the Gradio Frontend:** search for articles, summarize content, and generate citations.
"""

import gradio as gr
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def search_related_articles_crossref(query, max_results=3):
    """Search for related articles using the CrossRef API."""
    try:
        url = f"https://api.crossref.org/works?query={query}&rows={max_results}"
        # CrossRef asks polite clients to identify themselves; replace with your email.
        headers = {"User-Agent": "AI-Powered Research Assistant (your-email@example.com)"}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            articles = []
            data = response.json()
            for item in data['message']['items']:
                title = item.get('title', ['No Title'])[0]
                doi = item.get('DOI', 'No DOI')
                link = f"https://doi.org/{doi}"
                articles.append({"title": title, "link": link})
            if not articles:
                return [], "No articles found for the query."
            return articles, None
        else:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"
    except Exception as e:
        return [], f"Exception during CrossRef API call: {str(e)}"


from bs4 import BeautifulSoup


def extract_text_from_html(url):
    """Extract text content from an HTML page."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors
        soup = BeautifulSoup(response.text, 'html.parser')
        # This is a simplified approach; adjust the selector to match the site structure.
        paragraphs = soup.find_all('p')
        text_content = "\n".join([para.get_text() for para in paragraphs])
        return text_content
    except Exception as e:
        return f"Error extracting text: {str(e)}"


# Load the long-document summarization model
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")


def summarize_article(article_text):
    """Summarize a given article's text."""
    try:
        if not article_text or len(article_text.split()) < 20:
            return None, "Article content is too short to summarize."

        # Ensure the input text is not too long
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,       # Adjust max_length to control input size
            padding="max_length"
        )

        # Generate the summary
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=400,   # Limit the length of the output
            min_length=100,       # Set a minimum length for the output
            # length_penalty=1.0,       # Adjust length penalty to encourage longer output
            # no_repeat_ngram_size=3,   # Avoid repetition of phrases
            early_stopping=True
        )

        # Decode the output to get the summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary, None
    except Exception as e:
        return None, f"Exception during summarization: {str(e)}"
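# A minimal standalone check of the helpers above. The query string is purely
# illustrative, and the call chain assumes network access plus the downloaded
# summarization model; this is a sketch for trying out the pieces, not part of
# the assistant's pipeline. Uncomment to run:
#
# articles, err = search_related_articles_crossref("long document summarization", max_results=2)
# if not err and articles:
#     text = extract_text_from_html(articles[0]["link"])
#     summary, summary_err = summarize_article(text)
#     print(summary if summary else summary_err)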
# Load the citation-generation tokenizer and model
tokenizer_t5 = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")


def generate_citation_t5(article_title, citation_style, article_link):
    """Generate a citation using the T5 citation-generation model."""
    try:
        # Prepare the input text with explicit, structured formatting
        input_text = (f"'{article_title}'\n"
                      f"{article_link}\n"
                      f"Include author names, publication date, title, journal name, and DOI if available.\n"
                      f"Generate a {citation_style} style citation for the article")

        # Tokenize the input
        inputs = tokenizer_t5(input_text, return_tensors="pt", truncation=True, padding=True)

        # Generate the citation
        outputs = model_t5.generate(**inputs, max_new_tokens=70)

        # Decode the output to text
        citation = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
        return citation, None
    except Exception as e:
        return None, f"Exception during citation generation: {str(e)}"


from docx import Document


def create_thesis_document(title, summary, citations):
    """Create a Word document formatted like a PhD thesis."""
    doc = Document()

    # Title page (alignment 1 = centered, 0 = left-aligned)
    doc.add_paragraph(title, style='Title').alignment = 1
    doc.add_paragraph()  # Empty spacer line
    doc.add_paragraph('Thesis', style='Heading 1').alignment = 1
    doc.add_paragraph('Author Name', style='Normal').alignment = 1
    doc.add_paragraph('University Name', style='Normal').alignment = 1
    doc.add_paragraph('Date', style='Normal').alignment = 1
    doc.add_page_break()

    # Summary page
    doc.add_paragraph('Summary', style='Heading 1').alignment = 0
    doc.add_paragraph(summary, style='Normal')
    doc.add_page_break()

    # Citations page
    doc.add_paragraph('Citations', style='Heading 1').alignment = 0
    for citation in citations:
        doc.add_paragraph(citation, style='Normal')

    file_path = "Research_Document.docx"
    doc.save(file_path)
    return file_path
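# A quick offline check of the document builder above; the strings are dummy
# placeholders rather than model output, and the file lands in the working
# directory as Research_Document.docx. Uncomment to run:
#
# demo_path = create_thesis_document(
#     "Sample Topic",
#     "A short placeholder summary.",
#     ["Placeholder citation 1.", "Placeholder citation 2."],
# )
# print(f"Wrote {demo_path}")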
def research_assistant(research_topic, citation_style):
    """Main function to search, summarize, and generate citations."""
    if not research_topic:
        return "Please enter a research topic.", ["No summaries generated."], ["No citations generated."], None

    # Character limit check
    if len(research_topic) > 150:
        return "Error: Research topic exceeds 150 characters.", [], [], None

    # Search for related articles using CrossRef
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, [], [], None

    summaries = []
    citations = []
    article_content = ''
    for article in articles:
        try:
            # Fetching the full text may not always be feasible: the DOI link often
            # redirects to a publisher page, so metadata may be all that is available.
            article_content += f"{extract_text_from_html(article['link'])}.\n"

            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)
        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")

    summary, error = summarize_article(article_content)
    if error:
        summaries.append(f"Error summarizing article: {error}")
    else:
        summaries.append(summary)

    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)

    return research_topic, summaries, citations, file_path


# Create the Gradio interface with download functionality
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=[
        gr.Textbox(label="Enter your research topic or question:",
                   placeholder="Enter your research topic (max 150 characters)"),
        gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
    ],
    outputs=[
        gr.Textbox(label="Research Topic"),
        gr.Textbox(label="Summaries of Articles"),
        gr.Textbox(label="Generated Citations"),
        gr.DownloadButton(label="Download Document")
    ],
    title="AI-Powered Research Assistant",
    allow_flagging="never"
)

gr_interface.launch(share=True)
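# The pipeline can also be exercised without the Gradio UI (illustrative topic
# string; assumes the models above are loaded and the machine has network
# access). Uncomment to run:
#
# topic, demo_summaries, demo_citations, doc_path = research_assistant(
#     "transformer models for scientific summarization", "APA"
# )
# print(doc_path)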