File size: 8,415 Bytes
70c2b2c
 
abf24f3
 
 
 
 
70c2b2c
 
abf24f3
70c2b2c
 
 
 
 
 
 
 
 
 
e736148
baaf744
70c2b2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abf24f3
 
70c2b2c
 
 
 
 
 
 
8fc80cf
70c2b2c
 
 
 
 
 
 
 
8fc80cf
70c2b2c
abf24f3
70c2b2c
 
abf24f3
70c2b2c
 
 
 
8fc80cf
70c2b2c
 
 
 
 
 
abf24f3
 
70c2b2c
 
 
 
 
 
 
 
 
 
 
abf24f3
70c2b2c
 
abf24f3
70c2b2c
 
abf24f3
70c2b2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9205504
70c2b2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9205504
 
70c2b2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e736148
 
 
 
 
70c2b2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9205504
70c2b2c
 
 
 
 
 
 
 
 
 
 
 
abf24f3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# -*- coding: utf-8 -*-
"""AI-Powered Research Assistant for Scholars and Researchers.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1u8Qvn2TOmOr1hZ_BovZPUA3KCku31OXb
"""

# !pip install gradio requests transformers beautifulsoup4 python-docx torch

"""**Set Up the Environment:** Install the required libraries

**Create the Gradio Frontend:** searching for articles, summarizing content, generating citations
"""

import gradio as gr
import requests
from transformers import pipeline

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def search_related_articles_crossref(query, max_results=3):
    """Search for related articles using the CrossRef REST API.

    Args:
        query: Free-text search string (URL-encoded automatically).
        max_results: Maximum number of works to request (CrossRef ``rows``).

    Returns:
        A ``(articles, error)`` tuple: ``articles`` is a list of
        ``{"title": ..., "link": ...}`` dicts and ``error`` is ``None`` on
        success; on failure ``articles`` is empty and ``error`` describes it.
    """
    try:
        url = "https://api.crossref.org/works"
        headers = {"User-Agent": "AI-Powered Research Assistant ([email protected])"}  # Replace with your email
        # Let requests URL-encode the query (raw f-string interpolation broke
        # on spaces/special characters); a timeout prevents the UI hanging
        # forever if CrossRef stalls.
        response = requests.get(
            url,
            params={"query": query, "rows": max_results},
            headers=headers,
            timeout=30,
        )

        if response.status_code == 200:
            data = response.json()
            articles = [
                {
                    "title": item.get('title', ['No Title'])[0],
                    "link": f"https://doi.org/{item.get('DOI', 'No DOI')}",
                }
                for item in data['message']['items']
            ]
            if not articles:
                return [], "No articles found for the query."
            return articles, None
        else:
            return [], f"Error fetching articles: {response.status_code} - {response.text}"
    except Exception as e:
        return [], f"Exception during CrossRef API call: {str(e)}"

from bs4 import BeautifulSoup

def extract_text_from_html(url):
    """Extract visible paragraph text from an HTML page.

    Args:
        url: Page URL to fetch.

    Returns:
        The newline-joined text of all ``<p>`` elements, or an error-message
        string if the page could not be fetched or parsed (this function
        never raises; callers check the returned text).
    """
    try:
        # Timeout so one slow or unresponsive publisher page cannot hang
        # the whole request pipeline.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise on 4xx/5xx so the except reports it
        soup = BeautifulSoup(response.text, 'html.parser')

        # Simplified extraction: paragraph tags only. Publisher pages with
        # other layouts may need site-specific selectors.
        paragraphs = soup.find_all('p')
        text_content = "\n".join(para.get_text() for para in paragraphs)

        return text_content
    except Exception as e:
        return f"Error extracting text: {str(e)}"

# Load the summarization model once at import time so summarize_article()
# can reuse it across calls (downloads the weights on first run).
tokenizer = AutoTokenizer.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/long-t5-tglobal-base-16384-book-summary")

def summarize_article(article_text):
    """Summarize a given article's text with the long-t5 book-summary model.

    Args:
        article_text: Raw text to summarize (at least ~20 words).

    Returns:
        A ``(summary, error)`` tuple; exactly one element is non-None.
    """
    try:
        if not article_text or len(article_text.split()) < 20:
            return None, "Article content is too short to summarize."
        # Truncate/pad the input so it fits the model's expected input size.
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # Adjust max_length to control input size
            padding="max_length"
        )

        # Generate the summary, bounding the output length on both sides.
        # (Removed commented-out tuning flags that were dead code.)
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=400,  # Limit the length of the output
            min_length=100,      # Set a minimum length for the output
            early_stopping=True
        )

        # Decode the generated token ids back into plain text.
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        return summary, None
    except Exception as e:
        return None, f"Exception during summarization: {str(e)}"

# Load the citation-generation model once at import time; used by
# generate_citation_t5() below.
tokenizer_t5 = AutoTokenizer.from_pretrained("scieditor/citation-generation-t5")
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("scieditor/citation-generation-t5")

def generate_citation_t5(article_title, citation_style, article_link):
    """Generate a citation for an article using the T5 citation model.

    Args:
        article_title: Title of the article being cited.
        citation_style: Citation style name (e.g. "APA", "MLA", "Chicago").
        article_link: URL (typically a DOI link) for the article.

    Returns:
        A ``(citation, error)`` tuple; exactly one element is non-None.
    """
    try:
        # Structured prompt: title, link, then instructions for the model.
        prompt_lines = [
            f"'{article_title}'",
            article_link,
            "Include author names, publication date, title, journal name, and DOI if available.",
            f"Generate a {citation_style} style citation for the article",
        ]
        input_text = "\n".join(prompt_lines)

        # Tokenize the prompt for the model.
        encoded = tokenizer_t5(input_text, return_tensors="pt", truncation=True, padding=True)

        # Run generation with a short output budget — citations are brief.
        generated_ids = model_t5.generate(**encoded, max_new_tokens=70)

        # Decode the generated ids back into the citation string.
        citation = tokenizer_t5.decode(generated_ids[0], skip_special_tokens=True)
        return citation, None
    except Exception as e:
        return None, f"Exception during citation generation: {str(e)}"

from docx import Document
from docx.shared import Pt
from docx.oxml.ns import qn

def create_thesis_document(title, summary, citations):
    """Create a Word document formatted like a PhD thesis.

    Builds a title page, a summary page, and a citations page, then saves
    the document as "Research_Document.docx" in the working directory.

    Args:
        title: Document title (also used on the title page).
        summary: Summary text placed on its own page.
        citations: Iterable of citation strings, one paragraph each.

    Returns:
        The path of the saved .docx file.
    """
    document = Document()

    # --- Title page (alignment == 1 centres the paragraph) ---
    document.add_paragraph(title, style='Title').alignment = 1
    document.add_paragraph()  # spacer line

    for text, style_name in (
        ('Thesis', 'Heading 1'),
        ('Author Name', 'Normal'),
        ('University Name', 'Normal'),
        ('Date', 'Normal'),
    ):
        document.add_paragraph(text, style=style_name).alignment = 1

    document.add_page_break()

    # --- Summary page (alignment == 0 is left-aligned) ---
    document.add_paragraph('Summary', style='Heading 1').alignment = 0
    document.add_paragraph(summary, style='Normal')

    document.add_page_break()

    # --- Citations page: one paragraph per citation ---
    document.add_paragraph('Citations', style='Heading 1').alignment = 0
    for entry in citations:
        document.add_paragraph(entry, style='Normal')

    output_path = "Research_Document.docx"
    document.save(output_path)
    return output_path

def research_assistant(research_topic, citation_style):
    """Main pipeline: search CrossRef, summarize articles, generate citations.

    Args:
        research_topic: User-supplied topic (max 150 characters).
        citation_style: Citation style name passed to the citation model.

    Returns:
        A 4-tuple ``(status_or_topic, summaries, citations, file_path)``.
        BUG FIX: the error paths previously returned only 3 values while the
        Gradio interface declares 4 output components; every return now
        yields 4 values, with ``file_path=None`` when no document was built.
    """
    if not research_topic:
        return "Please enter a research topic.", ["No summaries generated."], ["No citations generated."], None

    # Character limit check
    if len(research_topic) > 150:
        return "Error: Research topic exceeds 150 characters.", [], [], None

    # Search for related articles using CrossRef
    articles, error = search_related_articles_crossref(research_topic)
    if error:
        return error, [], [], None

    summaries = []
    citations = []
    article_content = ''

    for article in articles:
        try:
            # Full-text fetch is best-effort; extract_text_from_html returns
            # an error string (not an exception) when a page can't be read,
            # so citation generation below still proceeds.
            article_content += f"{extract_text_from_html(article['link'])}.\n"

            citation, error = generate_citation_t5(article['title'], citation_style, article['link'])
            if error:
                citations.append(f"Error generating citation for '{article['title']}': {error}")
            else:
                citations.append(citation)

        except Exception as e:
            summaries.append(f"Error processing article '{article['title']}': {str(e)}")
            citations.append(f"Error generating citation for '{article['title']}': {str(e)}")

    # Summarize the combined text of all fetched articles in one pass.
    summary, error = summarize_article(article_content)
    if error:
        summaries.append(f"Error summarizing article: {error}")
    else:
        summaries.append(summary)

    file_path = create_thesis_document(research_topic, "\n".join(summaries), citations)
    return research_topic, summaries, citations, file_path

# Create Gradio Interface with download functionality.
# The fn must return one value per output component declared below.
gr_interface = gr.Interface(
    fn=research_assistant,
    inputs=[
        gr.Textbox(label="Enter your research topic or question:", placeholder="Enter your research topic (max 150 characters)"),
        gr.Dropdown(choices=["APA", "MLA", "Chicago"], label="Choose a citation style:")
    ],
    outputs=[
        gr.Textbox(label="Research Topic"),
        gr.Textbox(label="Summaries of Articles"),
        gr.Textbox(label="Generated Citations"),
        # NOTE(review): DownloadButton as an output expects a file-path value;
        # gr.File is the more common output component for downloads — confirm
        # this works with the installed Gradio version.
        gr.DownloadButton(label="Download Document")
    ],
    title="AI-Powered Research Assistant",
    allow_flagging="never"
)

# share=True creates a temporary public link (required for Colab access).
gr_interface.launch(share=True)