File size: 2,995 Bytes
3505899
 
 
 
d6579b5
3505899
 
 
 
 
 
d6579b5
 
 
 
 
f9159cc
 
c8114a5
 
cdc5bd9
c8114a5
39b45c6
a7e7db9
39b45c6
 
 
 
 
 
c8114a5
3505899
 
 
 
 
 
 
d19e286
23b48d0
 
 
 
490445b
 
 
 
 
 
 
 
3505899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490445b
 
3505899
 
 
 
 
 
 
 
 
6678ed6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from AssistantService import GPTAssistant
from openai.error import AuthenticationError
import streamlit as st
import configparser
import os

# Read an optional API key from config.ini in the current working directory.
# ConfigParser.read() silently ignores a missing file, and a ConfigParser
# always exposes a DEFAULT section, so the lookup below is safe without a
# membership guard and assistant_api_key is always bound ('' when the file or
# the option is absent).
config = configparser.ConfigParser()
config.read('config.ini')
assistant_api_key = config.get('DEFAULT', 'API-KEY', fallback='')

# LangChain tracing settings exported through the process environment.
# Fixed values are written first, then the two values read from st.secrets,
# one at a time and in this order.
os.environ.update(
    LANGCHAIN_TRACING_V2="true",
    LANGCHAIN_ENDPOINT="https://api.smith.langchain.com",
)
os.environ.update(
    (secret_name, st.secrets[secret_name])
    for secret_name in ("LANGCHAIN_API_KEY", "LANGCHAIN_PROJECT")
)

# Landing copy: what the app does, plus a link to the repository.
st.write(
    "This app helps you to extract data from HTML code using web scraping. "
    "It uses GPT-3.5-turbo to generate the code for you. \n "
    "*Contribute to this project on "
    "[GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*"
)
# Empty write, kept from the original layout between intro and demo.
st.write("")

# Collapsible section holding the demo video.
with st.expander("Check out the video demo"):
    yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")

# Quick-start instructions followed by an animated example.
info_text = """
**Quick start** \n
Fill the input with the HTML code you want to extract data from
Example below:
"""
st.write(info_text)
st.image("https://j.gifs.com/gpqvPl.gif")


# Ask the user for a key only when config.ini did not provide one.
if not assistant_api_key:
    # type="password" masks the key on screen; strip() drops whitespace that
    # tends to come along when a key is pasted.
    assistant_api_key = st.text_input(
        "Paste your API key here:", type="password"
    ).strip()

# gpt_assistant is intentionally left unbound while no key is available: the
# sections below report "Complete the API key field" in that situation.
if assistant_api_key:
    gpt_assistant = GPTAssistant(assistant_api_key)


# HTML snippet from which the assistant derives the output format.
html_content = st.text_input("Paste your piece of HTML here:")

extract_button = st.button("Extract data format")

if html_content and extract_button:
    # gpt_assistant is a top-level name of this script that is only bound once
    # an API key is available, so test for it explicitly. Catching NameError
    # around the model call would also hide unrelated NameErrors raised inside
    # the assistant itself.
    if 'gpt_assistant' not in globals():
        st.write("Complete the API key field")
    else:
        try:
            output = gpt_assistant.chain_response_format(html_content)
        except AuthenticationError:
            st.write("Invalid API key")
        else:
            # Persist the format so it survives Streamlit reruns.
            st.session_state['output_format'] = output

# Once a format has been extracted, show it and offer code generation.
if 'output_format' in st.session_state:
    output_format = st.code(st.session_state['output_format'], language="json")

    if st.button("Generate the code"):
        # gpt_assistant is a top-level name that is only bound once an API key
        # is available; check for it explicitly rather than catching NameError
        # around the model call, which would mask unrelated NameErrors.
        if 'gpt_assistant' not in globals():
            st.write("Complete the API key field")
        else:
            try:
                python_code = gpt_assistant.chain_code_generator(
                    st.session_state['output_format'], html_content
                )
            except AuthenticationError:
                st.write("Invalid API key")
            else:
                st.session_state['code_generated'] = python_code
                # Executable variant: the generated source plus a call that
                # stores the extraction outcome in `result`.
                st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"


def _run_generated_code(source_code, html_data):
    """Execute generated extractor source and return the `result` it sets.

    The source runs in its own namespace, seeded only with ``html_data``, so
    it cannot overwrite names used by this script. Returns None when the
    source never assigns ``result``. Exceptions raised by the source propagate
    to the caller.

    NOTE(security): this runs model-generated code with exec(), with no
    sandboxing beyond the separate namespace. Review before exposing the app
    to untrusted users.
    """
    namespace = {"html_data": html_data}
    exec(source_code, namespace)
    return namespace.get("result")


# Once code has been generated, show it and let the user try it on full HTML.
if 'code_generated' in st.session_state:
    python_function_label = st.write("Here is your python function:")
    code_generated = st.code(st.session_state['code_generated'], language="python")
    full_content = st.text_input("Paste your complete HTML here:")
    test_code = st.button("Test the code")
    if full_content and test_code:
        try:
            result = _run_generated_code(
                st.session_state['code_generated_exec'], full_content
            )
        except Exception as err:
            # Generated code is arbitrary, so any failure is reported as an
            # extraction error instead of crashing the page.
            result = None
            st.write(f"The generated code raised {type(err).__name__}: {err}")
        if result:
            st.write("data extracted successfully")
            # show data in table
            st.table(result)
        else:
            st.write("error extracting data")