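"""Streamlit front end for the Scrapy-based web scraper.

Launch it with the standard Streamlit CLI (the filename is assumed here):

    streamlit run app.py
"""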
import streamlit as st
import subprocess
import os

from jsonToText import convert_json_to_text
from llm import process_and_save_json
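# Local helper modules in this repository:
# - convert_json_to_text is assumed to flatten the scraped JSON items into a
#   plain-text file for the LLM step.
# - process_and_save_json is assumed to send that text to the Groq API and
#   save the structured result as JSON.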

# Default XPath used when the user does not provide one
DEFAULT_XPATH = '//body'

# Function to update the spider with user inputs and run it
def run_spider(website_url, xpath):
    # Extract domain from the website URL
    domain = website_url.split("//")[-1].split("/")[0]
    
    # Update the spider file with user input (start_urls, custom_xpath, and allowed_domains)
    spider_path = 'webscraper/webscraper/spiders/websiteSpider.py'
    
    # Read the spider file
    with open(spider_path, 'r') as file:
        spider_code = file.readlines()
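
    # The in-place rewrite below assumes websiteSpider.py defines these three
    # attributes as single-line class assignments, roughly like this
    # (hypothetical sketch, adjust to the real spider):
    #
    #     import scrapy
    #
    #     class WebsiteSpider(scrapy.Spider):
    #         name = "websiteSpider"
    #         allowed_domains = ["example.com"]
    #         start_urls = ["https://example.com"]
    #         custom_xpath = "//body"
    #
    #         def parse(self, response):
    #             yield {"content": response.xpath(self.custom_xpath).getall()}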

    # Modify start_urls, custom_xpath, and allowed_domains
    for idx, line in enumerate(spider_code):
        if line.strip().startswith('start_urls ='):
            spider_code[idx] = f'    start_urls = ["{website_url}"]\n'
        if line.strip().startswith('custom_xpath ='):
            spider_code[idx] = f'    custom_xpath = "{xpath}"\n'
        if line.strip().startswith('allowed_domains ='):
            spider_code[idx] = f'    allowed_domains = ["{domain}"]\n'

    # Write back the modified spider code
    with open(spider_path, 'w') as file:
        file.writelines(spider_code)

    # Run the Scrapy spider from the project directory using subprocess.
    # The spider is expected to write its items to scraped.json (e.g. via a FEEDS
    # setting in the Scrapy project), since the rest of this app reads that file.
    spider_dir = 'webscraper/webscraper'  # Directory containing the Scrapy project
    subprocess.run(['scrapy', 'crawl', 'websiteSpider'], cwd=spider_dir)


# Streamlit UI
st.title('Web Scraper Interface')

# User input for website link and XPath
website_url = st.text_input('Enter the website URL:', '')
xpath = st.text_input('Enter the XPath:', DEFAULT_XPATH)  # Default XPath if not provided

# Tracks whether the spider ran during this script execution. Streamlit reruns the
# whole script on every interaction, so the file-existence check below is what keeps
# the download buttons available on later reruns.
spider_ran = False

# Button to run the spider
if st.button('Run Spider'):
    if website_url:
        st.write(f'Running the spider on {website_url} using XPath: {xpath}')
        run_spider(website_url, xpath)
        st.success('Spider finished running!')
        convert_json_to_text("webscraper/webscraper/scraped.json", "output.txt")
        spider_ran = True
    else:
        st.error('Please provide a website URL.')

# If spider has been run, show download buttons
if spider_ran or os.path.exists("webscraper/webscraper/scraped.json"):
    # Add an option to download the output.txt file
    with open("output.txt", "r") as file:
        st.download_button(
            label="Download Output Text",
            data=file,
            file_name="output.txt",
            mime="text/plain"
        )
    
    # Add an option to download the scraped.json file
    with open("webscraper/webscraper/scraped.json", "r") as json_file:
        st.download_button(
            label="Download Scraped JSON",
            data=json_file,
            file_name="scraped.json",
            mime="application/json"
        )

# Title for organizing section
st.title("Do you want to organize the scraped data?")

# Use session state to track if the user has clicked "Yes"
if "organize_requested" not in st.session_state:
    st.session_state.organize_requested = False
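# st.session_state persists across Streamlit reruns, whereas st.button returns
# True only on the run triggered by the click, so this flag keeps the organize
# section visible after subsequent interactions.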

# Button to toggle the organize section
if st.button("Yes"):
    st.session_state.organize_requested = True

# If the user clicked "Yes", show inputs for 'about', 'details', and the API key, plus the organize button
if st.session_state.organize_requested:
    # Input for 'about': a short description of the data to organize
    about = st.text_input('Describe the data in one or two words (e.g. "books" or "events"):', '')

    # Input for the details to extract (e.g. name, price, stock)
    details = st.text_input('Enter the details to extract, comma separated (e.g. name, date):', '')

    # Input for the API key
    api_key = st.text_input('Enter your Groq API key:', type="password")

    # Button to organize and save JSON
    if st.button("Organize"):
        if about and details and api_key:
            # Convert comma-separated details into a list
            details_list = [detail.strip() for detail in details.split(',')]
            
            # Process and save the JSON with the provided details and API key
            process_and_save_json("output.txt", "organize.json", api_key=api_key, about=about, details=details_list)
            st.success('Data has been organized and saved to organize.json.')
            
            # Add an option to download the organized JSON file
            with open("organize.json", "r") as organized_json_file:
                st.download_button(
                    label="Download Organized JSON",
                    data=organized_json_file,
                    file_name="organize.json",
                    mime="application/json"
                )
        else:
            st.error('Please provide a description, details, and your API key before organizing.')