"""Streamlit front end for GPT-assisted web scraping.

The page collects an HTML snippet, asks ``GPTAssistant`` for an output format
and for Python extraction code, and lets the user run that code against a
complete page pasted into a second input.
"""
from AssistantService import GPTAssistant
from openai.error import AuthenticationError
import streamlit as st
from langsmith.run_helpers import traceable
import configparser
import os

# Optional local key: ConfigParser.read() ignores a missing config.ini, and
# the DEFAULT section always exists on a parser, so this lookup simply falls
# back to '' when no key has been configured.
config = configparser.ConfigParser()
config.read('config.ini')
assistant_api_key = config.get('DEFAULT', 'API-KEY', fallback='')

# LangSmith tracing settings: fixed values first, then the two values taken
# from Streamlit secrets.
os.environ.update(
    LANGCHAIN_TRACING_V2="true",
    LANGCHAIN_ENDPOINT="https://api.smith.langchain.com",
)
os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_PROJECT"] = st.secrets["LANGCHAIN_PROJECT"]

@traceable(run_type="tool")
def start():
    """Render the landing section of the page.

    Writes, in order: the introduction, a collapsible demo video, the
    quick-start instructions and an illustrative GIF. Returns nothing.
    """
    intro = "This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*"
    quick_start = """
    **Quick start** \n
    Fill the input with <HTML code>.
    * Choose a repeating element on the page, like a product on a list.
    * Inspect the HTML code and copy the element.
    
    After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it
    """

    st.write(intro)

    # The video sits inside an expander so it only takes space on demand.
    with st.expander(label="Check out the video demo"):
        st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")

    st.write(quick_start)
    st.image("https://j.gifs.com/gpqvPl.gif")

# Draw the landing section before any of the interactive widgets.
start()

# Key resolution: a value from config.ini wins; Streamlit secrets are the
# fallback when the config value is empty.
if not assistant_api_key:
    assistant_api_key = st.secrets["API_KEY"]

# When neither source yields a key, gpt_assistant is left unbound on purpose:
# the generation steps further down catch the resulting NameError and tell
# the user to complete the API key.
if assistant_api_key:
    gpt_assistant = GPTAssistant(assistant_api_key)

@traceable(run_type="tool")
def invalid_input(html) -> bool:
    """Placeholder check for rejected input.

    No validation is implemented yet: the function prints ``html`` to
    standard output and always returns ``True``.

    Args:
        html: The value to report (presumably the text the user pasted;
            confirm against the caller).

    Returns:
        Always ``True``.
    """
    # TODO: more checks
    print(html)
    return True


# Snippet pasted by the user; written by html_content_input() and read by the
# generation steps below. None means "nothing usable was entered".
html_content = None


def html_content_input():
    """Render the snippet input and the generate button.

    The pasted text is stored in the module-level ``html_content`` so the
    generation steps that follow can read it. If the text looks like a URL
    (starts with ``"http"``) a hint is shown, the value is reported through
    ``invalid_input`` and ``html_content`` is reset to ``None`` so a URL is
    never sent to the assistant.

    Returns:
        The value returned by ``st.button`` for the
        "Generate output format & code" button.
    """
    # Without this declaration the assignment below would create a local and
    # the module-level html_content would stay None forever.
    global html_content
    html_content = st.text_input("Paste the HTML tags of the item you want to extract:", max_chars=10000, help="example: <li>Product 1 </li>, watch the video above")

    # check if html_content is an url, and show error if it is
    if html_content and html_content.startswith("http"):
        st.write("Please paste the HTML piece code, not the URL")
        if invalid_input(html_content):
            html_content = None

    return st.button("Generate output format & code")


extract_button = html_content_input()

# Step 1 of 2: ask the assistant for an output format describing the pasted
# snippet and keep it in st.session_state for the next step.
if extract_button and html_content:
    try:
        st.write("1/2: Generating the output format...")
        output = gpt_assistant.chain_response_format(html_content)
    except NameError:
        # gpt_assistant is only bound when an API key was found at start-up.
        st.write("Complete the API key field")
    except AuthenticationError:
        st.write("Invalid API key")
    else:
        st.session_state['output_format'] = output

# Step 2 of 2: show the stored output format and ask the assistant for the
# matching extraction code.
# NOTE(review): this runs on every script execution in which 'output_format'
# is present in the session, not only right after step 1 - confirm intended.
if 'output_format' in st.session_state:
    output_format = st.code(st.session_state['output_format'], language="json")

    try:
        st.write("2/2: Generating the code...")
        python_code = gpt_assistant.chain_code_generator(
            st.session_state['output_format'],
            html_content,
        )
    except NameError:
        # gpt_assistant is only bound when an API key was found at start-up.
        st.write("Complete the API key field")
    except AuthenticationError:
        st.write("Invalid API key")
    else:
        # Keep two variants: the code as shown to the user, and a runnable
        # one that also calls the generated function on `html_data`.
        st.session_state['code_generated'] = python_code
        st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
            
@traceable(run_type="tool")
def test_the_code(code, full_content):
    """Execute generated extraction code against ``full_content`` and report it.

    The code is expected to bind a name ``result`` (the caller appends
    ``result = extract_info(html_data)`` to the generated function). It is run
    in a copy of this module's globals in which ``html_data`` is set to
    ``full_content`` and ``result`` starts as ``None``, so the outcome depends
    only on the arguments and the executed code cannot overwrite the app's
    own module-level names.

    Args:
        code: Python source to execute.
        full_content: The complete HTML exposed to the code as ``html_data``.

    Returns:
        The ``result`` value produced by the code when it is truthy,
        otherwise the string ``"error"``.

    Raises:
        Any exception raised by the executed code propagates unchanged.
    """
    # NOTE(security): exec() runs model-generated code with the full
    # privileges of this process. Only run this where that is acceptable;
    # sandboxing is out of scope for this function.
    namespace = dict(globals())
    namespace["html_data"] = full_content
    namespace["result"] = None
    exec(code, namespace)
    result = namespace["result"]

    if result:
        st.write("data extracted successfully")
        # show data in table
        st.table(result)
    else:
        st.write("error extracting data")

    return result or "error"
    

# Test section: shown only once code has been generated and stored in the
# session. Widgets are created in the order they should appear on the page.
if 'code_generated' in st.session_state:
    python_function_label = st.write("Here is your python function:")
    code_generated = st.code(st.session_state['code_generated'],language="python")
    full_content = st.text_input("Paste your complete HTML here:")
    test_code = st.button("Test the code")
    # Run only when there is HTML to test against and the button was pressed.
    if full_content and test_code:
        # Module-level names used by the executed snippet: the line appended
        # in 'code_generated_exec' reads `html_data` and assigns `result`.
        html_data = full_content
        # Reset so a value left over from an earlier run is not reported.
        result = None
        test_the_code(st.session_state['code_generated_exec'], full_content=full_content)