Spaces:
Runtime error
Runtime error
File size: 5,186 Bytes
98069ee 13e8d8a 98069ee 5312841 98069ee f26eb20 98069ee f26eb20 98069ee 45a083d 98069ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import logging
import mimetypes
import os
import re
import tempfile
import urllib.parse
from typing import Optional

import gradio as gr
import openai
import requests
import validators
from langchain.document_loaders import UnstructuredFileLoader
from pydantic import NoneStr
class WebpageSummarizer:
    """Summarize the chemistry-related content of a webpage with the OpenAI API.

    The pipeline is: download the URL into a temporary file, parse that file
    with ``UnstructuredFileLoader``, send the extracted text to an OpenAI
    completion model, and expose the whole thing through a small Gradio UI.
    """

    # Seconds to wait for the remote server; without a timeout a stalled host
    # would block the UI callback indefinitely.
    REQUEST_TIMEOUT = 30

    # Message shown in the UI whenever a summary cannot be produced.
    ERROR_MESSAGE = "Please enter a valid URL."

    def __init__(self) -> None:
        """Set the OpenAI API key from the ``OPENAI_API_KEY`` environment variable."""
        openai.api_key = os.getenv("OPENAI_API_KEY")

    @staticmethod
    def _guess_suffix(content_type: Optional[str]) -> Optional[str]:
        """Map a ``Content-Type`` header value to a file extension, if known.

        Args:
            content_type: Raw header value, possibly ``None`` or carrying
                parameters (e.g. ``"text/html; charset=utf-8"``).

        Returns:
            The guessed extension including the leading dot, or ``None`` when
            the header is missing, empty, or not a known MIME type.
        """
        if not content_type:
            return None
        # mimetypes only recognises the bare "type/subtype" form, so drop
        # parameters such as "; charset=utf-8" before the lookup.
        mime_type = content_type.split(";", 1)[0].strip()
        if not mime_type:
            return None
        return mimetypes.guess_extension(mime_type)

    def upload_via_url(self, url: str) -> Optional[str]:
        """Download *url* into a temporary file and return that file's path.

        Args:
            url: The URL of the webpage.

        Returns:
            Path of the temporary file holding the response body, or ``None``
            when ``url`` is not a valid URL. The file is created with
            ``delete=False``, so the caller is responsible for removing it.

        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status.
        """
        if not validators.url(url):
            return None

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Do not summarize an error page as if it were the requested content.
        response.raise_for_status()

        suffix = self._guess_suffix(response.headers.get("content-type"))
        # The context manager closes (and therefore flushes) the file, so the
        # loader is guaranteed to see the complete body on disk.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
            temp_file.write(response.content)
        return temp_file.name

    def save_content(self, file_path: str) -> list:
        """Parse the file at *file_path* into documents.

        Args:
            file_path: Path of the file to load.

        Returns:
            Whatever ``UnstructuredFileLoader.load()`` returns for the file
            (a list of documents), parsed with the "fast" strategy.
        """
        loader = UnstructuredFileLoader(file_path, strategy="fast")
        return loader.load()

    def generate_summary(
        self,
        text: str,
        model: str = "text-davinci-003",
        max_tokens: int = 500,
    ) -> str:
        """Generate a summary of *text* using the OpenAI completion API.

        Args:
            text: The text to be summarized.
            model: Name of the OpenAI completion model to use.
            max_tokens: Upper bound on the length of the generated summary.

        Returns:
            The generated summary with surrounding whitespace stripped.
        """
        # NOTE(review): "text-davinci-003" and the ``openai.Completion``
        # interface are legacy; confirm they are still available for the
        # installed ``openai`` version and the account in use.
        prompt = f"Summarize the chemical related parts from given text. if text has other language return the summary as english. text: {text}"
        response = openai.Completion.create(
            model=model,
            prompt=prompt,
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0.6,
        )
        return response.choices[0].text.strip()

    def summarize_webpage(self, url: str) -> str:
        """Download, parse and summarize the webpage at *url*.

        Args:
            url: The URL of the webpage.

        Returns:
            The generated summary, or ``ERROR_MESSAGE`` when the URL is
            invalid or any step of the pipeline fails.
        """
        temporary_file_path = None
        try:
            temporary_file_path = self.upload_via_url(url)
            if temporary_file_path is None:
                return self.ERROR_MESSAGE

            documents = self.save_content(temporary_file_path)
            # Feed the model the extracted text, not the repr of the
            # document objects.
            text = "\n\n".join(doc.page_content for doc in documents)
            return self.generate_summary(text)
        except Exception:
            # UI boundary: never crash the Gradio callback, but keep the
            # traceback in the logs so real failures can be diagnosed.
            logging.getLogger(__name__).exception("Failed to summarize %r", url)
            return self.ERROR_MESSAGE
        finally:
            # The download is only needed while parsing; remove it so
            # repeated requests do not fill the temp directory.
            if temporary_file_path is not None:
                try:
                    os.remove(temporary_file_path)
                except OSError:
                    pass

    def gradio_interface(self):
        """Build and launch the Gradio UI for webpage summarization."""
        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
            gr.HTML(
                '<img class="leftimage" align="left" src="https://templates.images.credential.net/1612472097627370951721412474196.png" alt="Image" width="210" height="210">\n'
                '<img class="rightimage" align="right" src="https://logos-download.com/wp-content/uploads/2016/06/Syngenta_logo.png" alt="Image" width="150" height="140">'
            )
            with gr.Row():
                with gr.Column(elem_id="col-container"):
                    gr.HTML("""<center><h1>Syngenta Chemical Identifier</h1></center>""")
                    inputs = gr.Textbox(label="URL")
                    btn = gr.Button(label="Submit",value = "Analyse")
                    outputs = gr.Textbox(label="Summary", lines=6)
                    btn.click(fn=self.summarize_webpage, inputs=inputs, outputs=outputs)
        # Launch the Gradio interface
        demo.launch()
if __name__ == "__main__":
    # Script entry point: build the summarizer and serve its Gradio UI.
    summarizer_app = WebpageSummarizer()
    summarizer_app.gradio_interface()
|