Spaces:

smfaiz
/

research-assistant

Sleeping

App Files Files Community

smfaiz committed on Sep 1, 2024

Commit

70c2b2c

verified ·

1 Parent(s): c6a33c8

Create app.py

Browse files

Files changed (1) hide show

app.py +241 -0

app.py ADDED Viewed

	@@ -0,0 +1,241 @@

+# -*- coding: utf-8 -*-
+"""AI-Powered Research Assistant for Scholars and Researchers.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
+"""
# Dependencies are NOT installed from inside this script.  The Colab export
# contained the IPython shell escape ``!pip install ...`` here, which is a
# SyntaxError in a plain ``.py`` file.  Install the packages before running the
# app instead (for example by listing them in a ``requirements.txt``):
#
#     pip install gradio requests transformers beautifulsoup4 python-docx torch
#
# Notebook outline carried over from the Colab markdown cell:
#   * Set up the environment: install the required libraries.
#   * Create the Gradio frontend: searching for articles, summarizing content,
#     generating citations.
+import gradio as gr
+import requests
+from transformers import pipeline
# Hugging Face pipelines built eagerly at import time: one "summarization"
# pipeline and one GPT-2 "text-generation" pipeline.
# NOTE(review): neither object is referenced by the functions visible in this
# file -- confirm whether they are still needed before keeping the load cost.
summarizer = pipeline(task="summarization", model="scieditor/citation-generation-t5")
citation_generator = pipeline(task="text-generation", model="gpt2")
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
def search_related_articles_crossref(query, max_results=3, timeout=30):
    """Search the CrossRef ``/works`` endpoint for articles matching *query*.

    Args:
        query: Free-text search string.
        max_results: Number of rows requested from CrossRef.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(articles, error)`` tuple.  On success ``articles`` is a list of
        ``{"title": ..., "link": ...}`` dicts and ``error`` is ``None``.  On
        any failure ``articles`` is ``[]`` and ``error`` is a message string.
        The function never raises.
    """
    try:
        headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
        # Pass the query through ``params`` so requests URL-encodes it; a raw
        # f-string URL breaks on characters such as '&' or '#'.
        response = requests.get(
            "https://api.crossref.org/works",
            params={"query": query, "rows": max_results},
            headers=headers,
            timeout=timeout,  # without a timeout a stalled connection hangs forever
        )
        if response.status_code != 200:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"

        articles = []
        for item in response.json()['message']['items']:
            # A missing or empty title list must not abort the whole search
            # with an IndexError; fall back to the placeholder instead.
            titles = item.get('title') or ['No Title']
            doi = item.get('DOI', 'No DOI')
            articles.append({"title": titles[0], "link": f"https://doi.org/{doi}"})

        if not articles:
            return [], "No articles found for the query."
        return articles, None
    except Exception as e:
        # Deliberate catch-all: callers rely on the (articles, error) contract
        # and surface the message in the UI rather than handling exceptions.
        return [], f"Exception during CrossRef API call: {str(e)}"
+from bs4 import BeautifulSoup
def extract_text_from_html(url, timeout=30):
    """Download *url* and return the text of all ``<p>`` elements.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The paragraph texts joined with newlines.  On any failure the function
        does not raise; it returns a string starting with
        ``"Error extracting text: "`` followed by the exception message.
    """
    try:
        # A timeout keeps one unresponsive site from blocking the caller forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP 4xx/5xx into an exception
        soup = BeautifulSoup(response.text, 'html.parser')
        # Simplified extraction: every <p> on the page is taken; the selector
        # may need adjusting per site structure.
        return "\n".join(para.get_text() for para in soup.find_all('p'))
    except Exception as e:
        # Deliberate catch-all: failures are reported in-band as a string.
        return f"Error extracting text: {str(e)}"
# Tokenizer and seq2seq model used by summarize_article(); both are loaded once
# at import time from the same Hugging Face model id.
_SUMMARY_CHECKPOINT = "pszemraj/long-t5-tglobal-base-16384-book-summary"
tokenizer_s = AutoTokenizer.from_pretrained(_SUMMARY_CHECKPOINT)
model_s = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARY_CHECKPOINT)
def summarize_article(article_text, max_input_tokens=512, max_new_tokens=400,
                      min_length=100, min_words=20):
    """Summarize *article_text* with the module-level ``model_s``/``tokenizer_s``.

    Args:
        article_text: Text to summarize.
        max_input_tokens: Token length the input is truncated and padded to.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_length: Minimum length passed to ``generate``.
        min_words: Inputs with fewer whitespace-separated words are rejected.

    Returns:
        A ``(summary, error)`` tuple.  On success ``error`` is ``None``; when
        the input is too short or an exception occurs ``summary`` is ``None``
        and ``error`` is a message string.  The function never raises.
    """
    try:
        if not article_text or len(article_text.split()) < min_words:
            return None, "Article content is too short to summarize."

        # Truncate so the encoder input never exceeds max_input_tokens.
        inputs = tokenizer_s(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_tokens,
            padding="max_length",
        )

        summary_ids = model_s.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            min_length=min_length,
            early_stopping=True,
        )

        # Only the first (and only) generated sequence is decoded.
        summary = tokenizer_s.decode(summary_ids[0], skip_special_tokens=True)
        return summary, None
    except Exception as e:
        # Deliberate catch-all: errors are reported through the return tuple.
        return None, f"Exception during summarization: {str(e)}"
# Tokenizer and seq2seq model used by generate_citation_t5(); both come from
# the same Hugging Face model id and are loaded once at import time.
_CITATION_CHECKPOINT = "scieditor/citation-generation-t5"
tokenizer = AutoTokenizer.from_pretrained(_CITATION_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CITATION_CHECKPOINT)
def generate_citation_t5(article_title, citation_style, article_link):
    """Ask the module-level seq2seq ``model`` for a citation string.

    The prompt consists of the quoted title, the link, a fixed instruction line
    and a line naming *citation_style*, separated by newlines.

    Returns:
        ``(citation, None)`` on success, or ``(None, message)`` if anything
        raised while tokenizing, generating or decoding.
    """
    try:
        prompt_lines = [
            f"'{article_title}'",
            f"{article_link}",
            "Include author names, publication date, title, journal name, and DOI if available.",
            f"Generate a {citation_style} style citation for the article",
        ]
        prompt = "\n".join(prompt_lines)

        encoded = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
        # At most 70 new tokens are generated for the citation.
        generated = model.generate(**encoded, max_new_tokens=70)
        return tokenizer.decode(generated[0], skip_special_tokens=True), None
    except Exception as exc:
        return None, f"Exception during citation generation: {str(exc)}"
+from docx import Document
+from docx.shared import Pt
+from docx.oxml.ns import qn
def create_thesis_document(title, summary, citations, file_path="Research_Document.docx"):
    """Write a three-page Word document: title page, summary, citations.

    Args:
        title: Text for the centered ``Title`` paragraph on the first page.
        summary: Body text placed under the "Summary" heading.
        citations: Iterable of citation strings, one paragraph each; ``None``
            is treated as no citations.
        file_path: Where the ``.docx`` is saved.  Defaults to the original
            fixed name in the current working directory.

    Returns:
        ``file_path``, after the document has been saved there.
    """
    # Function-scope import: named enum members replace the bare 0/1 alignment
    # integers without touching the file-level import block.
    from docx.enum.text import WD_ALIGN_PARAGRAPH

    doc = Document()

    def add_aligned(text, style, alignment):
        """Add one paragraph with the given style and set its alignment."""
        paragraph = doc.add_paragraph(text, style=style)
        paragraph.alignment = alignment
        return paragraph

    # Title page: everything centered; the author/university/date lines are
    # literal placeholders.
    add_aligned(title, 'Title', WD_ALIGN_PARAGRAPH.CENTER)
    doc.add_paragraph()  # Blank spacer line under the title
    title_page_lines = (
        ('Thesis', 'Heading 1'),
        ('Author Name', 'Normal'),
        ('University Name', 'Normal'),
        ('Date', 'Normal'),
    )
    for text, style in title_page_lines:
        add_aligned(text, style, WD_ALIGN_PARAGRAPH.CENTER)
    doc.add_page_break()

    # Summary page
    add_aligned('Summary', 'Heading 1', WD_ALIGN_PARAGRAPH.LEFT)
    doc.add_paragraph(summary, style='Normal')
    doc.add_page_break()

    # Citation page
    add_aligned('Citations', 'Heading 1', WD_ALIGN_PARAGRAPH.LEFT)
    for citation in citations or []:
        doc.add_paragraph(citation, style='Normal')

    doc.save(file_path)
    return file_path
+# Example Usage
+# title = "Federated Learning\nA Comprehensive Study"
+# summary = "This thesis explores federated learning techniques including quantum natural gradient descent and their applications in various domains. Detailed analysis and results are provided."
+# citations = [
+#     "Federated quantum natural gradient descent for quantum federated learning. DOI: https://doi.org/10.1016/b978-0-44-319037-7.00028-4",
+#     "Federated Machine Learning for Systems Medicine. DOI: https://doi.org/10.14293/gof.23.06",
+#     "Adversarial robustness in federated learning. DOI: https://doi.org/10.1016/b978-0-44-319037-7.00013-2"
+# ]
+# create_thesis_document(title, summary, citations)
def research_assistant(research_topic, citation_style):
    """Search for articles, summarize them, build citations and a Word file.

    Args:
        research_topic: Topic or question typed by the user (max 75 chars).
        citation_style: Style name inserted into the citation prompt.

    Returns:
        Always a 4-tuple ``(status_or_topic, summaries, citations, file_path)``
        so it lines up with the four output components of the interface.  On
        validation or search failure the first element is the error message
        and ``file_path`` is ``None``.
    """
    # Every return path must yield four values: the interface declares four
    # outputs, and the success path below already returns four.
    if not research_topic or not research_topic.strip():
        return "Please enter a research topic.", ["No summaries generated."], ["No citations generated."], None

    # Character limit check
    if len(research_topic) > 75:
        return "Error: Research topic exceeds 75 characters.", [], [], None

    # Search for related articles using CrossRef
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, [], [], None

    summaries = []
    citations = []
    contents = []

    for article in articles:
        try:
            text = extract_text_from_html(article['link'])
            # extract_text_from_html reports failures in-band as a string with
            # this prefix; keep those messages out of the summarizer input.
            if not text.startswith("Error extracting text:"):
                contents.append(f"{text}.\n")

            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)
        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")

    # One combined summary is produced over the text of all fetched articles.
    summary, error = summarize_article("".join(contents))
    if error:
        summaries.append(f"Error summarizing article: {error}")
    else:
        summaries.append(summary)

    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
    return research_topic, summaries, citations, file_path
# Gradio UI: two inputs (topic text, citation style) feed research_assistant,
# whose results fill three text boxes and a download button for the document.
_ui_inputs = [
    gr.Textbox(
        label="Enter your research topic or question:",
        placeholder="Enter your research topic (max 75 characters)",
        lines=2,
    ),
    gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:"),
]
_ui_outputs = [
    gr.Textbox(label="Research Topic"),
    gr.Textbox(label="Summaries of Articles"),
    gr.Textbox(label="Generated Citations"),
    gr.DownloadButton(label="Download Document"),
]
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=_ui_inputs,
    outputs=_ui_outputs,
    title="AI-Powered Research Assistant",
    allow_flagging="never",
)
# Start the web server as soon as the module is executed.
gr_interface.launch(debug=True)