File size: 5,186 Bytes
98069ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13e8d8a
98069ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5312841
 
98069ee
 
f26eb20
98069ee
f26eb20
98069ee
 
 
 
 
45a083d
98069ee
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import mimetypes
import os
import re
import tempfile
import urllib.parse
from typing import Optional

import gradio as gr
import openai
import requests
import validators
from langchain.document_loaders import UnstructuredFileLoader
from pydantic import NoneStr

class WebpageSummarizer:
    """
    Summarize the content of a webpage using the OpenAI completion API.

    Pipeline: download the page to a temporary file (upload_via_url),
    parse it with langchain's UnstructuredFileLoader (save_content),
    send the extracted text to OpenAI (generate_summary), and expose
    the whole flow through a Gradio UI (gradio_interface).
    """

    def __init__(self):
        """
        Set the OpenAI API key from the OPENAI_API_KEY environment variable.
        """
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def upload_via_url(self, url: str) -> Optional[str]:
        """
        Download webpage content via URL into a temporary file.

        Args:
            url (str): The URL of the webpage.

        Returns:
            Optional[str]: Path of the temporary file holding the page
            content, or None when the URL is not valid.

        Raises:
            requests.RequestException: On network failure, timeout, or
                an HTTP error status.
        """
        # Reject malformed URLs up front instead of letting requests fail.
        if not validators.url(url):
            return None

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }

        # A timeout keeps the UI from hanging forever on an unresponsive
        # host; raise_for_status() surfaces HTTP errors (404, 500, ...)
        # instead of silently summarizing an error page.
        retrieve = requests.get(url, headers=headers, timeout=30)
        retrieve.raise_for_status()

        # The content-type header may carry parameters
        # ("text/html; charset=utf-8"); strip them, otherwise
        # mimetypes.guess_extension() returns None. A missing header
        # previously raised TypeError inside guess_extension().
        content_type = retrieve.headers.get("content-type", "")
        mime_type = content_type.split(";")[0].strip()
        file_extension = mimetypes.guess_extension(mime_type)

        # Save the payload to a named temporary file. The context manager
        # closes the handle (the original leaked it); delete=False keeps
        # the file on disk for the loader — summarize_webpage removes it.
        with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
            temp_file.write(retrieve.content)
            return temp_file.name

    def save_content(self, file_path: str) -> list:
        """
        Load the content of the file at the given path as documents.

        Args:
            file_path (str): The path of the file to load.

        Returns:
            list: The langchain Document objects extracted from the file.
        """
        # "fast" strategy trades parsing accuracy for speed.
        loader = UnstructuredFileLoader(file_path, strategy="fast")
        return loader.load()

    def generate_summary(self, text: str) -> str:
        """
        Generate a summary of the given text using the OpenAI API.

        Args:
            text (str): The text to be summarized.

        Returns:
            str: The generated summary, stripped of surrounding whitespace.
        """
        prompt = f"Summarize the chemical related parts from given text. if text has other language return the summary as english. text: {text}"
        # NOTE(review): openai.Completion with text-davinci-003 is a legacy
        # API/model; migrating to the chat completions API would require an
        # openai-package upgrade, so it is kept as-is here.
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0,       # deterministic output
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0.6,
        )

        return response.choices[0].text.strip()

    def summarize_webpage(self, url: str) -> str:
        """
        Summarize a webpage end to end: download, parse, summarize.

        Args:
            url (str): The URL of the webpage.

        Returns:
            str: The generated summary, or an error message when the URL
            is invalid or any step of the pipeline fails.
        """
        temporary_file_path = None
        try:
            # Download the page into a temporary file.
            temporary_file_path = self.upload_via_url(url)

            # Extract the document content from the downloaded file.
            document_content = self.save_content(temporary_file_path)

            # Summarize the extracted content.
            return self.generate_summary(document_content)
        except Exception:
            # Invalid URL, network failure, parsing or API error. A bare
            # except here previously swallowed SystemExit/KeyboardInterrupt.
            return "Please enter a valid URL."
        finally:
            # Remove the temporary download so files don't accumulate
            # (NamedTemporaryFile was created with delete=False).
            if temporary_file_path and os.path.exists(temporary_file_path):
                os.remove(temporary_file_path)

    def gradio_interface(self):
        """
        Build and launch the Gradio UI wrapping summarize_webpage.
        """
        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
            gr.HTML("""<img class="leftimage" align="left" src="https://templates.images.credential.net/1612472097627370951721412474196.png" alt="Image" width="210" height="210">
                  <img class="rightimage" align="right" src="https://logos-download.com/wp-content/uploads/2016/06/Syngenta_logo.png" alt="Image" width="150" height="140">""")
            with gr.Row():
                with gr.Column(elem_id="col-container"):
                    gr.HTML("""<center><h1>Syngenta Chemical Identifier</h1></center>""")
                    inputs = gr.Textbox(label="URL")
                    btn = gr.Button(label="Submit",value = "Analyse")
                    outputs = gr.Textbox(label="Summary", lines=6)

            # Wire the button to the full summarization pipeline.
            btn.click(fn=self.summarize_webpage, inputs=inputs, outputs=outputs)

        # Launch the Gradio interface.
        demo.launch()


if __name__ == "__main__":
    # Instantiate the summarizer and start the Gradio UI.
    summarizer = WebpageSummarizer()
    summarizer.gradio_interface()