File size: 3,367 Bytes
3505899
 
 
1da5fb5
3505899
d6579b5
3505899
 
 
 
 
 
d6579b5
 
 
 
 
f9159cc
c8114a5
 
cdc5bd9
c8114a5
39b45c6
a7e7db9
39b45c6
 
 
 
 
c8114a5
3505899
014a1d9
3505899
 
 
 
 
d19e286
014a1d9
ae663ee
 
 
 
 
23b48d0
 
 
490445b
 
 
 
 
 
 
 
3505899
 
 
 
 
 
 
 
 
 
 
 
 
 
1da5fb5
 
 
 
 
 
 
 
 
 
 
 
 
3505899
 
 
 
 
490445b
 
3505899
 
1da5fb5
6678ed6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from AssistantService import GPTAssistant
from openai.error import AuthenticationError
import streamlit as st
from langsmith.run_helpers import traceable
import configparser
import os

# Load the optional local configuration. ConfigParser.read() silently skips
# files it cannot open, so a missing config.ini simply leaves the key empty.
config = configparser.ConfigParser()
config.read('config.ini')
# The DEFAULT section is always present on a ConfigParser, so the option can
# be looked up directly; '' is used when the option is not set.
assistant_api_key = config.get('DEFAULT', 'API-KEY', fallback='')

# LangSmith tracing configuration: two fixed settings first, then the two
# values that are read from the Streamlit secrets under the same name.
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
})
for _secret_name in ("LANGCHAIN_API_KEY", "LANGCHAIN_PROJECT"):
    os.environ[_secret_name] = st.secrets[_secret_name]

# Landing text: what the app does and where the project lives.
st.write(
    "This app helps you to extract data from HTML code using web scraping. "
    "It uses GPT-3.5-turbo to generate the code for you. \n "
    "*Contribute to this project on "
    "[GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*"
)

# Collapsible demo video.
with st.expander(label="Check out the video demo"):
    st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")

# Quick-start instructions followed by an animated walkthrough.
quick_start_md = """
**Quick start** \n
Fill the input with the HTML code you want to extract data from
"""
st.write(quick_start_md)
st.image("https://j.gifs.com/gpqvPl.gif")


# Fall back to the Streamlit secret when config.ini supplied no key.
if assistant_api_key == '':
    assistant_api_key = st.secrets["API_KEY"]

# Build the assistant only when a key is available. With an empty key,
# `gpt_assistant` is deliberately left undefined; the sections below that
# use it detect this and ask the user for the API key.
if assistant_api_key:
    gpt_assistant = GPTAssistant(assistant_api_key)


html_content = st.text_input("Paste your piece of HTML here:", max_chars=10000)
# The app works on pasted markup, not on addresses: if the input looks like a
# URL, tell the user and discard it so the later steps are skipped.
if html_content and html_content.startswith("http"):
    st.write("Please paste the HTML piece code, not the URL")
    html_content = None

extract_button = st.button("Extract data format")

# Step 1: ask the assistant for the output format of the pasted HTML and keep
# it in the session so it survives Streamlit reruns.
if html_content and extract_button:
    # `gpt_assistant` is only bound when an API key was found. Check for the
    # name explicitly instead of catching NameError around the whole call,
    # which would also swallow unrelated NameErrors raised inside the
    # assistant and misreport them as a missing key.
    if "gpt_assistant" not in globals():
        st.write("Complete the API key field")
    else:
        try:
            output = gpt_assistant.chain_response_format(html_content)
        except AuthenticationError:
            st.write("Invalid API key")
        else:
            st.session_state['output_format'] = output

# Step 2: once an output format is stored, show it and offer code generation.
if 'output_format' in st.session_state:
    st.code(st.session_state['output_format'], language="json")

    if st.button("Generate the code"):
        # `gpt_assistant` is only bound when an API key was found. Check for
        # the name explicitly instead of catching NameError around the whole
        # call, which would also hide unrelated NameErrors raised inside the
        # assistant behind a misleading "missing key" message.
        if "gpt_assistant" not in globals():
            st.write("Complete the API key field")
        else:
            try:
                python_code = gpt_assistant.chain_code_generator(
                    st.session_state['output_format'], html_content
                )
            except AuthenticationError:
                st.write("Invalid API key")
            else:
                # Keep the generated function for display, plus a runnable
                # variant that calls it and binds the outcome to `result`.
                st.session_state['code_generated'] = python_code
                st.session_state['code_generated_exec'] = (
                    python_code + "\nresult = extract_info(html_data)"
                )
            
@traceable(run_type="tool")
def test_the_code(code, full_content):
    """Run the generated extraction code and display what it produced.

    Args:
        code: Python source to execute. It is expected to bind a variable
            named ``result`` and may read the HTML from ``html_data``.
        full_content: The complete HTML document; exposed to ``code`` under
            the name ``html_data``.

    Returns:
        The value ``code`` bound to ``result`` when it is truthy, otherwise
        the string ``"error"`` (also when ``code`` raises).
    """
    # SECURITY: this runs model-generated code with the full privileges of
    # the app process. exec() is kept because running that code is the
    # purpose of this step, but it must not be exposed to untrusted users
    # without sandboxing.
    #
    # Run in a copy of the module namespace: the snippet still sees every
    # name it could see before, but it cannot overwrite the app's own
    # globals, and the function no longer depends on the caller having
    # seeded `html_data` / `result` at module level.
    namespace = dict(globals())
    namespace["html_data"] = full_content
    namespace["result"] = None

    try:
        exec(code, namespace)
    except Exception as exc:
        # Generated code can fail in arbitrary ways; report the failure in
        # the page instead of aborting the whole script run.
        st.write("error extracting data")
        st.exception(exc)
        return "error"

    extracted = namespace.get("result")
    if extracted:
        st.write("data extracted successfully")
        # show data in table
        st.table(extracted)
    else:
        st.write("error extracting data")

    return extracted or "error"
    

# Step 3: show the generated function and let the user try it on a full page.
if 'code_generated' in st.session_state:
    st.write("Here is your python function:")
    st.code(st.session_state['code_generated'], language="python")
    full_content = st.text_input("Paste your complete HTML here:")
    run_requested = st.button("Test the code")
    if run_requested and full_content:
        # The stored snippet ends with `result = extract_info(html_data)`,
        # so both names are seeded at module level before it is run.
        html_data, result = full_content, None
        test_the_code(
            st.session_state['code_generated_exec'],
            full_content=full_content,
        )