File size: 3,297 Bytes
3505899
 
 
 
 
 
 
 
 
 
 
 
f93d314
3505899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6678ed6
d770ec6
6678ed6
d770ec6
 
 
 
 
 
 
 
 
 
6678ed6
d770ec6
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from AssistantService import GPTAssistant
from openai.error import AuthenticationError
import streamlit as st
import configparser

# --- Configuration and API key -------------------------------------------
# An API key may be supplied through an optional ``config.ini`` placed next
# to the script (option ``API-KEY`` in the ``DEFAULT`` section). When the file
# or the option is missing, the user is asked for the key in the page instead.
config = configparser.ConfigParser()
config.read('config.ini')  # ConfigParser.read() silently skips a missing file

# Bind the name unconditionally so the checks below can never hit an
# undefined variable, whatever the parser contains.
assistant_api_key = ''
if 'DEFAULT' in config:
    assistant_api_key = config['DEFAULT'].get('API-KEY', '')

st.title("Web Scraping Assistant")
st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you.")
st.write("Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)")

if not assistant_api_key:
    # Mask the secret while it is typed and drop whitespace that tends to be
    # picked up when a key is copy-pasted.
    assistant_api_key = st.text_input("Paste your API key here:", type="password").strip()

# NOTE: ``gpt_assistant`` is deliberately left unbound while no key is
# available; the button handlers further down catch the resulting NameError
# and ask the user to complete the API key field.
if assistant_api_key:
    gpt_assistant = GPTAssistant(assistant_api_key)

html_content = st.text_input("Paste your piece of HTML here:")

# Step 1: once an HTML snippet is present, offer a button that asks the
# assistant for the data format and stores the answer in the session state.
# The button is only rendered when the snippet is non-empty (short-circuit).
if html_content and st.button("Extract data format"):
    try:
        st.session_state['output_format'] = gpt_assistant.chain_response_format(html_content)
    except NameError:
        # ``gpt_assistant`` is not bound when no API key has been provided.
        st.write("Complete the API key field")
    except AuthenticationError:
        st.write("Invalid API key")

# Step 2: with a stored data format, display it and offer to generate the
# extraction code for it.
if 'output_format' in st.session_state:
    detected_format = st.session_state['output_format']
    output_format = st.code(detected_format, language="json")

    if st.button("Generate the code"):
        # Statement appended to the generated source so that executing it
        # leaves the extraction outcome in a variable named ``result``.
        exec_suffix = "\nresult = extract_info(html_data)"
        try:
            generated_source = gpt_assistant.chain_code_generator(detected_format, html_content)
            st.session_state['code_generated'] = generated_source
            st.session_state['code_generated_exec'] = generated_source + exec_suffix
        except NameError:
            # ``gpt_assistant`` is not bound when no API key has been provided.
            st.write("Complete the API key field")
        except AuthenticationError:
            st.write("Invalid API key")


# Step 3: show the generated function and let the user try it on a full page.
if 'code_generated' in st.session_state:
    st.write("Here is your python function:")
    st.code(st.session_state['code_generated'], language="python")
    full_content = st.text_input("Paste your complete HTML here:")
    if full_content and st.button("Test the code"):
        # SECURITY: this executes model-generated Python inside the app
        # process. Only run the app in an environment where that is
        # acceptable; exec() offers no sandboxing.
        #
        # The code runs in its own namespace (instead of this script's
        # globals) so it cannot overwrite the app's variables. The stored
        # source ends with ``result = extract_info(html_data)``, hence the
        # ``html_data`` input and the ``result`` output below.
        exec_namespace = {'html_data': full_content}
        result = None
        try:
            exec(st.session_state['code_generated_exec'], exec_namespace)
        except Exception as exc:  # generated code may fail in arbitrary ways
            # Report the failure in the page rather than crashing the script.
            st.write("error extracting data")
            st.exception(exc)
        else:
            result = exec_namespace.get('result')
            if result:
                st.write("data extracted successfully")
                # show data in table
                st.table(result)
            else:
                st.write("error extracting data")

# --- Usage instructions ----------------------------------------------------
st.title("How to use this app")

st.write("1. Paste the html code of your target element in the first text box and press \"Enter\"")
# st.button() returns a plain bool that is True only on the rerun triggered by
# the click, so the example is visible for exactly that run.
if st.button("Show example"):
    st.text_area("Example", value='<li><div class="product"> <h3 class="title">Product 1</h3> <p class="description">This is the description of the product 1</p> <span class="price">10.00</span> </div></li>')
    # Clicking this button reruns the script; on that run "Show example"
    # reads False again, so the example is no longer rendered. No extra state
    # handling is needed (and a bool has no ``disabled`` attribute to set).
    st.button("Close example")

st.write("2. Click on the button 'Extract data format'")

st.write("3. Click on the button 'Generate the code'")

st.write("4. Paste the complete html code in the last text box to test the auto generated code")

st.write("5. Copy the code and include it in your own projects")