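"""Streamlit front end for the Scrapy-based web scraper.

Launch it with the standard Streamlit CLI (the filename is assumed here):

    streamlit run app.py
"""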
import streamlit as st
import subprocess
import os

from jsonToText import convert_json_to_text
from llm import process_and_save_json
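# Local helper modules in this repository:
# - convert_json_to_text is assumed to flatten the scraped JSON items into a
#   plain-text file for the LLM step.
# - process_and_save_json is assumed to send that text to the Groq API and
#   save the structured result as JSON.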

# Default XPath used when the user does not provide one
DEFAULT_XPATH = '//body'

# Function to update the spider with user inputs and run it
def run_spider(website_url, xpath):
    # Extract domain from the website URL
    domain = website_url.split("//")[-1].split("/")[0]
    
    # Update the spider file with user input (start_urls, custom_xpath, and allowed_domains)
    spider_path = 'webscraper/webscraper/spiders/websiteSpider.py'
    
    # Read the spider file
    with open(spider_path, 'r') as file:
        spider_code = file.readlines()
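
    # The in-place rewrite below assumes websiteSpider.py defines these three
    # attributes as single-line class assignments, roughly like this
    # (hypothetical sketch, adjust to the real spider):
    #
    #     import scrapy
    #
    #     class WebsiteSpider(scrapy.Spider):
    #         name = "websiteSpider"
    #         allowed_domains = ["example.com"]
    #         start_urls = ["https://example.com"]
    #         custom_xpath = "//body"
    #
    #         def parse(self, response):
    #             yield {"content": response.xpath(self.custom_xpath).getall()}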

    # Modify start_urls, custom_xpath, and allowed_domains
    for idx, line in enumerate(spider_code):
        if line.strip().startswith('start_urls ='):
            spider_code[idx] = f'    start_urls = ["{website_url}"]\n'
        if line.strip().startswith('custom_xpath ='):
            spider_code[idx] = f'    custom_xpath = "{xpath}"\n'
        if line.strip().startswith('allowed_domains ='):
            spider_code[idx] = f'    allowed_domains = ["{domain}"]\n'

    # Write back the modified spider code
    with open(spider_path, 'w') as file:
        file.writelines(spider_code)

    # Run the Scrapy spider from the project directory using subprocess.
    # The spider is expected to write its items to scraped.json (e.g. via a FEEDS
    # setting in the Scrapy project), since the rest of this app reads that file.
    spider_dir = 'webscraper/webscraper'  # Directory containing the Scrapy project
    subprocess.run(['scrapy', 'crawl', 'websiteSpider'], cwd=spider_dir)


# Streamlit UI
st.title('Web Scraper Interface')

# User input for website link and XPath
website_url = st.text_input('Enter the website URL:', '')
xpath = st.text_input('Enter the XPath:', DEFAULT_XPATH)  # Default XPath if not provided

# Tracks whether the spider ran during this script execution. Streamlit reruns the
# whole script on every interaction, so the file-existence check below is what keeps
# the download buttons available on later reruns.
spider_ran = False

# Button to run the spider
if st.button('Run Spider'):
    if website_url:
        st.write(f'Running the spider on {website_url} using XPath: {xpath}')
        run_spider(website_url, xpath)
        st.success('Spider finished running!')
        convert_json_to_text("webscraper/webscraper/scraped.json", "output.txt")
        spider_ran = True
    else:
        st.error('Please provide a website URL.')

# If spider has been run, show download buttons
if spider_ran or os.path.exists("webscraper/webscraper/scraped.json"):
    # Add an option to download the output.txt file
    with open("output.txt", "r") as file:
        st.download_button(
            label="Download Output Text",
            data=file,
            file_name="output.txt",
            mime="text/plain"
        )
    
    # Add an option to download the scraped.json file
    with open("webscraper/webscraper/scraped.json", "r") as json_file:
        st.download_button(
            label="Download Scraped JSON",
            data=json_file,
            file_name="scraped.json",
            mime="application/json"
        )

# Title for organizing section
st.title("Do you want to organize the scraped data?")

# Use session state to track if the user has clicked "Yes"
if "organize_requested" not in st.session_state:
    st.session_state.organize_requested = False
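# st.session_state persists across Streamlit reruns, whereas st.button returns
# True only on the run triggered by the click, so this flag keeps the organize
# section visible after subsequent interactions.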

# Button to toggle the organize section
if st.button("Yes"):
    st.session_state.organize_requested = True

# If the user clicked "Yes", show inputs for 'about', 'details', and the API key, plus the organize button
if st.session_state.organize_requested:
    # Input for 'about': a short description of the data to organize
    about = st.text_input('Describe the data in one or two words (e.g. "books" or "events"):', '')

    # Input for the details to extract (e.g. name, price, stock)
    details = st.text_input('Enter the details to extract, comma separated (e.g. name, date):', '')

    # Input for the API key
    api_key = st.text_input('Enter your Groq API key:', type="password")

    # Button to organize and save JSON
    if st.button("Organize"):
        if about and details and api_key:
            # Convert comma-separated details into a list
            details_list = [detail.strip() for detail in details.split(',')]
            
            # Process and save the JSON with the provided details and API key
            process_and_save_json("output.txt", "organize.json", api_key=api_key, about=about, details=details_list)
            st.success('Data has been organized and saved to organize.json.')
            
            # Add an option to download the organized JSON file
            with open("organize.json", "r") as organized_json_file:
                st.download_button(
                    label="Download Organized JSON",
                    data=organized_json_file,
                    file_name="organize.json",
                    mime="application/json"
                )
        else:
            st.error('Please provide a description, details, and your API key before organizing.')