# --- Non-code residue scraped from the hosting page (commented out so the file parses) ---
# Spaces: Sleeping
# File size: 8,415 Bytes
# (commit-hash gutter and line-number gutter removed)
# -*- coding: utf-8 -*-
"""AI-Powered Research Assistant for Scholars and Researchers.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
"""
# !pip install gradio requests transformers beautifulsoup4 python-docx torch
"""**Set Up the Environment:** Install the required libraries
**Create the Gradio Frontend:** searching for articles, summarizing content, generating citations
"""
import gradio as gr
import requests
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def search_related_articles_crossref(query, max_results=3):
    """Search for related articles using the CrossRef REST API.

    Args:
        query: Free-text search string (may contain spaces/special characters).
        max_results: Maximum number of articles to return (CrossRef ``rows``).

    Returns:
        Tuple ``(articles, error)``: ``articles`` is a list of
        ``{"title": ..., "link": ...}`` dicts and ``error`` is ``None`` on
        success, or a human-readable message on failure (with ``articles=[]``).
    """
    try:
        url = "https://api.crossref.org/works"
        # Pass the query via `params` so requests URL-encodes it; the original
        # f-string interpolation broke on spaces and special characters.
        params = {"query": query, "rows": max_results}
        headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
        # Timeout keeps the UI responsive if CrossRef is slow or unreachable.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        if response.status_code == 200:
            data = response.json()
            articles = []
            for item in data['message']['items']:
                # CrossRef may return a missing or empty title list; guard both.
                titles = item.get('title') or ['No Title']
                doi = item.get('DOI', 'No DOI')
                articles.append({"title": titles[0], "link": f"https://doi.org/{doi}"})
            if not articles:
                return [], "No articles found for the query."
            return articles, None
        else:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"
    except Exception as e:
        return [], f"Exception during CrossRef API call: {str(e)}"
from bs4 import BeautifulSoup
def extract_text_from_html(url):
    """Fetch *url* and return the concatenated text of its ``<p>`` elements.

    Args:
        url: Page to scrape (here, a DOI landing page).

    Returns:
        The extracted paragraph text joined with newlines, or an
        ``"Error extracting text: ..."`` string on any failure
        (network error, non-2xx status, parse error).
    """
    try:
        # Timeout prevents a slow/blocking publisher site from hanging the app.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise on 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        # Simplified extraction: take every paragraph. Adjust the selector
        # per-site if a more precise article body is needed.
        paragraphs = soup.find_all('p')
        return "\n".join(para.get_text() for para in paragraphs)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
# Long-document summarization model (LongT5 fine-tuned for book summaries,
# 16k-token context) used by summarize_article() below. Downloaded from the
# HuggingFace Hub on first run.
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
def summarize_article(article_text):
    """Summarize article text with the LongT5 book-summary model.

    Args:
        article_text: Raw text to summarize.

    Returns:
        Tuple ``(summary, error)``: the generated summary and ``None`` on
        success; ``(None, message)`` on failure or when the input is too
        short to be worth summarizing.
    """
    try:
        # Guard against empty or near-empty input, which yields junk summaries.
        if not article_text or len(article_text.split()) < 20:
            return None, "Article content is too short to summarize."
        # Truncate long inputs; padding is unnecessary for a single sequence.
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # Controls how much of the input the model sees
        )
        # Greedy generation; `early_stopping` was dropped because it only
        # applies to beam search and merely triggered a warning here.
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=400,  # Upper bound on summary length
            min_length=100,      # Encourage a reasonably detailed summary
        )
        # Decode token ids back to text, stripping special tokens.
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary, None
    except Exception as e:
        return None, f"Exception during summarization: {str(e)}"
# Citation-generation model (T5 fine-tuned by scieditor) used by
# generate_citation_t5() below. Downloaded from the HuggingFace Hub on first run.
tokenizer_t5 = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")
def generate_citation_t5(article_title, citation_style, article_link):
    """Generate a citation for an article with the citation-generation T5 model.

    Args:
        article_title: Title of the article to cite.
        citation_style: Requested style, e.g. "APA", "MLA", "Chicago".
        article_link: URL (DOI link) of the article.

    Returns:
        Tuple ``(citation, error)``: the generated citation string and
        ``None`` on success, or ``(None, message)`` on failure.
    """
    try:
        # Assemble the structured prompt fed to the model.
        prompt = "\n".join([
            f"'{article_title}'",
            f"{article_link}",
            "Include author names, publication date, title, journal name, and DOI if available.",
            f"Generate a {citation_style} style citation for the article",
        ])
        # Tokenize, generate, and decode back to plain text.
        encoded = tokenizer_t5(prompt, return_tensors="pt", truncation=True, padding=True)
        generated = model_t5.generate(**encoded, max_new_tokens=70)
        return tokenizer_t5.decode(generated[0], skip_special_tokens=True), None
    except Exception as e:
        return None, f"Exception during citation generation: {str(e)}"
from docx import Document
from docx.shared import Pt
from docx.oxml.ns import qn
def create_thesis_document(title, summary, citations, file_path="Research_Document.docx"):
    """Create a Word document formatted like a PhD thesis and save it.

    Args:
        title: Document title, shown on the title page.
        summary: Summary text placed on its own page.
        citations: Iterable of citation strings, one paragraph each.
        file_path: Destination .docx path (default preserves original behavior).

    Returns:
        The path the document was saved to.
    """
    doc = Document()
    # --- Title page ---
    doc.add_paragraph(title, style='Title').alignment = 1  # 1 = center alignment
    doc.add_paragraph()  # Spacer line
    doc.add_paragraph('Thesis', style='Heading 1').alignment = 1
    doc.add_paragraph('Author Name', style='Normal').alignment = 1
    doc.add_paragraph('University Name', style='Normal').alignment = 1
    doc.add_paragraph('Date', style='Normal').alignment = 1
    doc.add_page_break()
    # --- Summary page ---
    doc.add_paragraph('Summary', style='Heading 1').alignment = 0  # 0 = left alignment
    doc.add_paragraph(summary, style='Normal')
    doc.add_page_break()
    # --- Citations page ---
    doc.add_paragraph('Citations', style='Heading 1').alignment = 0
    for citation in citations:
        doc.add_paragraph(citation, style='Normal')
    doc.save(file_path)
    return file_path
def research_assistant(research_topic, citation_style):
    """Search CrossRef, summarize the articles, generate citations, and
    build a downloadable Word document.

    Args:
        research_topic: User-entered topic or question (max 150 chars).
        citation_style: One of the styles offered in the UI (APA/MLA/Chicago).

    Returns:
        A 4-tuple matching the four Gradio outputs:
        (topic or error message, summaries, citations, document path or None).
    """
    # BUG FIX: every return now has 4 elements. The original early returns
    # yielded only 3 values while the interface declares 4 outputs, crashing
    # the UI on empty/too-long input or a search error.
    if not research_topic:
        return "Please enter a research topic.", ["No summaries generated."], ["No citations generated."], None
    # Character limit check (mirrors the placeholder hint in the UI).
    if len(research_topic) > 150:
        return "Error: Research topic exceeds 150 characters.", [], [], None
    # Search for related articles using CrossRef.
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, [], [], None
    summaries = []
    citations = []
    article_content = ''
    for article in articles:
        try:
            # Best-effort: DOI landing pages often block scraping, so failures
            # surface as error strings folded into the combined text.
            article_content += f"{extract_text_from_html(article['link'])}.\n"
            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)
        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")
    # One combined summary over all fetched article text.
    summary, error = summarize_article(article_content)
    if error:
        summaries.append(f"Error summarizing article: {error}")
    else:
        summaries.append(summary)
    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
    return research_topic, summaries, citations, file_path
# Create the Gradio interface wiring research_assistant's 4 return values
# to the 4 outputs below. (Removed a stray "|" scraping artifact that made
# the launch line a syntax error.)
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=[
        gr.Textbox(label="Enter your research topic or question:", placeholder="Enter your research topic (max 150 characters)"),
        gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
    ],
    outputs=[
        gr.Textbox(label="Research Topic"),
        gr.Textbox(label="Summaries of Articles"),
        gr.Textbox(label="Generated Citations"),
        gr.DownloadButton(label="Download Document")
    ],
    title="AI-Powered Research Assistant",
    allow_flagging="never"
)
# share=True publishes a temporary public URL (needed when running in Colab).
gr_interface.launch(share=True)