Spaces:

syngent
/

URL_chemical_identifier

Runtime error

File size: 5,186 Bytes

import logging
import mimetypes
import os
import re
import tempfile
import urllib.parse
from typing import Optional

import gradio as gr
import openai
import requests
import validators
from langchain.document_loaders import UnstructuredFileLoader
from pydantic import NoneStr

class WebpageSummarizer:
    """
    Summarize the chemistry-related content of a webpage using the OpenAI API.

    The pipeline is: download the URL into a temporary file, parse that file
    with ``UnstructuredFileLoader``, and send the extracted text to an OpenAI
    completion model. ``gradio_interface`` wraps the pipeline in a small web UI.
    """

    # Message returned to the UI whenever a summary cannot be produced.
    INVALID_URL_MESSAGE = "Please enter a valid URL."

    # Seconds to wait on the remote server; without a timeout a stalled
    # server would block the UI callback indefinitely.
    REQUEST_TIMEOUT = 30

    _logger = logging.getLogger(__name__)

    def __init__(self):
        """
        Set the OpenAI API key from the ``OPENAI_API_KEY`` environment variable.
        """
        openai.api_key = os.getenv("OPENAI_API_KEY")

    @staticmethod
    def _guess_suffix(content_type: Optional[str]) -> str:
        """
        Map a ``Content-Type`` header value to a file extension.

        Args:
            content_type: The raw header value, possibly ``None`` or carrying
                parameters such as ``"; charset=utf-8"``.

        Returns:
            The guessed extension including the leading dot, or ``""`` when
            the header is missing or the media type is unknown.
        """
        if not content_type:
            return ""
        # mimetypes only recognises the bare media type, so drop any
        # parameters ("text/html; charset=utf-8" -> "text/html").
        media_type = content_type.split(";", 1)[0].strip().lower()
        return mimetypes.guess_extension(media_type) or ""

    def upload_via_url(self, url: str) -> Optional[str]:
        """
        Download a webpage and store its raw content in a temporary file.

        The caller owns the returned file and is responsible for deleting it.

        Args:
            url (str): The URL of the webpage.

        Returns:
            The path of the temporary file, or ``None`` when ``url`` is empty
            or not a valid URL.

        Raises:
            requests.RequestException: If the download fails or the server
                answers with an HTTP error status.
        """
        if not url or not url.strip():
            return None
        url = url.strip()

        # validators.url returns a falsy failure object for invalid input.
        if not validators.url(url):
            return None

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }

        # Send a GET request to retrieve the webpage content
        retrieve = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Do not summarize the body of an HTTP error page.
        retrieve.raise_for_status()

        # Pick a file extension from the response's content type
        file_extension = self._guess_suffix(retrieve.headers.get("content-type"))

        # The context manager closes (and therefore flushes) the file so the
        # complete content is on disk before anything reads it by path;
        # delete=False keeps the file around for the loader.
        with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
            temp_file.write(retrieve.content)
            return temp_file.name

    def save_content(self, file_path: str) -> list:
        """
        Load the file at ``file_path`` as document(s).

        Args:
            file_path (str): The path of the file to load.

        Returns:
            list: The result of ``UnstructuredFileLoader.load()`` for the file.
        """
        # strategy="fast" is passed to the loader for faster processing
        loader = UnstructuredFileLoader(file_path, strategy="fast")
        return loader.load()

    def generate_summary(self, text: str) -> str:
        """
        Generates a summary using OpenAI API.

        Args:
            text (str): The text to be summarized.

        Returns:
            str: The generated summary, stripped of surrounding whitespace.
        """
        prompt = f"Summarize the chemical related parts from given text. if text has other language return the summary as english. text: {text}"

        # NOTE(review): this uses the legacy Completion endpoint and the
        # "text-davinci-003" model; confirm both are still available for the
        # installed openai package / account.
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0.6,
        )

        return response.choices[0].text.strip()

    def summarize_webpage(self, url: str) -> str:
        """
        Summarizes a webpage using OpenAI API.

        Args:
            url (str): The URL of the webpage.

        Returns:
            str: The generated summary, or ``INVALID_URL_MESSAGE`` when the URL
            is invalid or any step of the pipeline fails (the failure is
            logged).
        """
        temporary_file_path = None
        try:
            # Download the page into a temporary file
            temporary_file_path = self.upload_via_url(url)
            if temporary_file_path is None:
                return self.INVALID_URL_MESSAGE

            # Parse the downloaded file into documents
            documents = self.save_content(temporary_file_path)

            # Send only the extracted text to the model, not the repr of the
            # document objects (which would add metadata noise to the prompt).
            text = "\n\n".join(doc.page_content for doc in documents)

            return self.generate_summary(text)
        except Exception:
            # Keep the UI contract (a plain message) but record the real cause
            # so download/parse/API failures can be diagnosed.
            self._logger.exception("Failed to summarize %r", url)
            return self.INVALID_URL_MESSAGE
        finally:
            # The temporary file was created with delete=False; remove it here
            # so repeated requests do not accumulate files on disk.
            if temporary_file_path is not None:
                try:
                    os.remove(temporary_file_path)
                except OSError:
                    pass

    def gradio_interface(self):
        """
        Build the Gradio UI for webpage summarization and launch it.

        The UI has a URL textbox, a button, and a summary textbox; clicking
        the button runs ``summarize_webpage`` on the entered URL.
        """
        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
            gr.HTML("""<img class="leftimage" align="left" src="https://templates.images.credential.net/1612472097627370951721412474196.png" alt="Image" width="210" height="210">
                  <img class="rightimage" align="right" src="https://logos-download.com/wp-content/uploads/2016/06/Syngenta_logo.png" alt="Image" width="150" height="140">""")
            with gr.Row():
                with gr.Column(elem_id="col-container"):
                    gr.HTML("""<center><h1>Syngenta Chemical Identifier</h1></center>""")
                    inputs = gr.Textbox(label="URL")
                    btn = gr.Button(label="Submit", value="Analyse")
                    outputs = gr.Textbox(label="Summary", lines=6)

            btn.click(fn=self.summarize_webpage, inputs=inputs, outputs=outputs)

        # Launch the Gradio interface
        demo.launch()


if __name__ == "__main__":
    # Script entry point: build the summarizer and start its Gradio UI.
    WebpageSummarizer().gradio_interface()