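"""
Streamlit app that scans uploaded HTML files for repeated, image-rich
container elements (e.g., product grids) and lets you download the
results as JSON or as a combined HTML page.
"""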
import base64
import copy
import json
from collections import Counter
from urllib.parse import urljoin

import streamlit as st
from bs4 import BeautifulSoup

def remove_svg_elements(element):
    """
    Remove all SVG elements from a BeautifulSoup element.
    Returns a copy of the element with SVGs removed.
    """
    # Copy the element so the original tree is not modified
    # (bs4 tags implement __copy__, so this clones the whole subtree)
    element_copy = copy.copy(element)
    # Find and remove all SVG elements
    if hasattr(element_copy, 'find_all'):
        for svg in element_copy.find_all('svg'):
            svg.decompose()
    return element_copy

def get_element_signature(element):
    """
    Create a signature for an element based on its structure.
    """
    signature = {
        'tag': element.name,
        'classes': tuple(sorted(element.get('class', []))),
        'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),
        'has_image': bool(element.find('img')),
        'has_price': bool(any(c in element.get_text() for c in '$€£¥')),
        'has_link': bool(element.find('a')),
    }
    return str(signature)
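
# For illustration: a product card like
#   <li class="product"><a href="..."><img src="..."></a><span>$9.99</span></li>
# produces the signature string (wrapped here for readability)
#   "{'tag': 'li', 'classes': ('product',), 'child_tags': ('a', 'span'),
#     'has_image': True, 'has_price': True, 'has_link': True}"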

def analyze_children_similarity(element):
    """
    Analyze how similar the direct children of an element are.
    Returns (similarity_score, count_of_most_common_signature).
    """
    if not element.contents:
        return 0, 0
    child_signatures = [
        get_element_signature(child)
        for child in element.find_all(recursive=False)
        if child.name
    ]
    if not child_signatures:
        return 0, 0
    signature_counts = Counter(child_signatures)
    most_common_sig, most_common_count = signature_counts.most_common(1)[0]
    similarity_score = most_common_count / len(child_signatures)
    return similarity_score, most_common_count
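
# For illustration: a <ul> with five direct <li> children, four of which share
# the same signature, yields (0.8, 4).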

def count_images_in_element(element):
    """
    Count all images within an element, including nested ones.
    """
    return len(element.find_all('img', recursive=True))

def get_element_identifier(element):
    """
    Create a unique identifier for an element including tag, classes, and id.
    """
    identifier = element.name
    if element.get('class'):
        identifier += f" .{' .'.join(element['class'])}"
    if element.get('id'):
        identifier += f" #{element['id']}"
    return identifier
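
# For illustration: <div class="grid products" id="catalog"> becomes
# "div .grid .products #catalog".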

def convert_relative_urls(soup, base_url):
    """
    Convert all relative URLs in the soup object to absolute URLs.
    """
    for tag in soup.find_all(href=True):
        tag['href'] = urljoin(base_url, tag['href'])
    for tag in soup.find_all(src=True):
        tag['src'] = urljoin(base_url, tag['src'])
    for tag in soup.find_all(attrs={'data-src': True}):
        tag['data-src'] = urljoin(base_url, tag['data-src'])
    return soup
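
# urljoin leaves absolute URLs untouched, so this is safe to run on mixed
# content: urljoin("https://example.com/shop/", "img/a.png") gives
# "https://example.com/shop/img/a.png", while an "https://..." src passes
# through unchanged.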

def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
    """
    Find elements containing images and return both a sorted list and
    detailed info about the top-scoring element.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Convert relative URLs to absolute if a base_url is provided
    if base_url:
        soup = convert_relative_urls(soup, base_url)
    # Collect potential container elements with their scores
    elements_with_scores = []
    for element in soup.find_all(['div', 'ul', 'section', 'main']):
        similarity_score, similar_children_count = analyze_children_similarity(element)
        image_count = count_images_in_element(element)
        if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:
            # Count products (direct children that contain an image)
            products_count = len([child for child in element.find_all(recursive=False)
                                  if child.name and child.find('img', recursive=True)])
            combined_score = similarity_score * similar_children_count * image_count
            elements_with_scores.append((element, image_count, combined_score, products_count))
    if not elements_with_scores:
        return [], {"error": "No elements with images found"}, ""
    # Sort by combined score, highest first
    elements_with_scores.sort(key=lambda x: x[2], reverse=True)
    # Build the sorted summary list
    sorted_elements = [
        (get_element_identifier(element), image_count, products_count)
        for element, image_count, _, products_count in elements_with_scores
    ]
    # The top element is the one with the highest combined score
    top_element = elements_with_scores[0][0]
    # Remove SVGs from the top element for the HTML output
    top_element_no_svg = remove_svg_elements(top_element)
    # Separate child elements ("products") with their images
    products = []
    for child in top_element_no_svg.find_all(recursive=False):
        if child.name:  # Skip text nodes
            # Remove SVGs from each product
            child_no_svg = remove_svg_elements(child)
            product_info = {
                "html_content": str(child_no_svg),
                "images": []
            }
            # Collect all images within this product
            for img in child_no_svg.find_all('img', recursive=True):
                product_info["images"].append({
                    "src": img.get('src', 'No source'),
                    "alt": img.get('alt', 'No alt text')
                })
            products.append(product_info)
    # Build the result dictionary for the top element
    top_element_info = {
        "parent": {
            "tag": top_element_no_svg.name,
            "identifier": get_element_identifier(top_element_no_svg),
            "classes": top_element_no_svg.get('class', []),
            "id": top_element_no_svg.get('id', None)
        },
        "products_count": len(products),
        "products": products
    }
    html_output = str(top_element_no_svg)
    return sorted_elements, top_element_info, html_output
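
# Example standalone use (illustrative; "page.html" is a hypothetical file):
#
#   with open("page.html", encoding="utf-8") as f:
#       sorted_elements, top_info, html_out = find_image_rich_parents(f.read())
#   for identifier, image_count, product_count in sorted_elements:
#       print(identifier, image_count, product_count)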

def get_download_link(content, filename, content_type="application/json"):
    """Generate a data-URI download link for the given content."""
    b64 = base64.b64encode(content.encode()).decode()
    return f'<a href="data:{content_type};base64,{b64}" download="{filename}">Download {filename}</a>'

def main():
    st.title("HTML File Analyzer")
    st.write("Upload HTML files to analyze their structure and find image-rich elements")
    # File uploader that accepts multiple files
    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])
    if uploaded_files:
        # Analysis parameters
        col1, col2 = st.columns(2)
        with col1:
            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
        with col2:
            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)
        # Generate button
        if st.button("Generate Analysis"):
            all_results = {}
            all_html_outputs = {}
            # Show a spinner while the files are processed
            with st.spinner('Processing files...'):
                # Process each file
                for uploaded_file in uploaded_files:
                    st.subheader(f"Analysis for {uploaded_file.name}")
                    try:
                        # Read and analyze the file
                        html_content = uploaded_file.read().decode('utf-8')
                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
                            html_content,
                            min_children=min_children,
                            min_similarity=min_similarity
                        )
                        # Display results
                        st.write("Elements containing images:")
                        for element, img_count, prod_count in sorted_elements:
                            st.write(f"- {element}: {img_count} images, {prod_count} products")
                        # Store results for download
                        all_results[uploaded_file.name] = top_element_info
                        all_html_outputs[uploaded_file.name] = html_output
                    except Exception as e:
                        st.error(f"Error processing {uploaded_file.name}: {e}")
                        continue
            # Offer download buttons if we have results
            if all_results:
                st.subheader("Download Results")
                col1, col2 = st.columns(2)
                # JSON download
                with col1:
                    json_str = json.dumps(all_results, indent=2)
                    st.markdown(get_download_link(json_str, 'analysis_results.json'),
                                unsafe_allow_html=True)
                # HTML download
                with col2:
                    # Combine all HTML outputs with file names as headers
combined_html = """
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<style>
div {
width: auto !important;
height: auto !important;
padding: 0 !important;
margin: 0 !important;
}
img {
width: 300px;
height: 300px;
object-fit: contain;
}
body { font-family: Arial, sans-serif; }
.file-section { margin: 20px 0; }
.file-header {
background: #f0f0f0;
padding: 10px;
margin: 20px 0;
}
</style>
</head>
<body>
"""
                    for filename, html in all_html_outputs.items():
                        combined_html += f"""
                        <div class="file-section">
                            <h2 class="file-header">{filename}</h2>
                            {html}
                        </div>
                        """
                    combined_html += "</body></html>"
                    st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
                                unsafe_allow_html=True)
            # Success message
            st.success("Analysis completed successfully!")
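
# To launch the app locally (assuming this file is saved as app.py):
#
#   streamlit run app.py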
if __name__ == "__main__":
main()