# NOTE(review): stray page-scrape artifacts ("Spaces:" / "Sleeping") removed — not part of the source.
import base64
import copy
import json
from collections import Counter
from urllib.parse import urljoin

import streamlit as st
from bs4 import BeautifulSoup
def remove_svg_elements(element):
    """Return a copy of *element* with every nested <svg> tag removed.

    The input is never mutated: BeautifulSoup tags implement ``__copy__``,
    so ``copy.copy`` produces an independent tree that is safe to edit.
    Non-tag inputs (e.g. plain strings) are copied and returned unchanged.
    """
    duplicate = copy.copy(element)
    # NavigableStrings and other non-tag objects have no find_all.
    if hasattr(duplicate, 'find_all'):
        for svg_tag in duplicate.find_all('svg'):
            svg_tag.decompose()
    return duplicate
def get_element_signature(element):
    """Build a string fingerprint describing an element's structure.

    The fingerprint combines the tag name, sorted CSS classes, sorted tag
    names of direct children, and three content flags (image, currency
    glyph, link), so structurally similar siblings produce equal strings.
    """
    direct_child_tags = sorted(
        child.name for child in element.find_all(recursive=False) if child.name
    )
    text = element.get_text()
    fingerprint = {
        'tag': element.name,
        'classes': tuple(sorted(element.get('class', []))),
        'child_tags': tuple(direct_child_tags),
        # bool(Tag) is length-based in bs4, so keep the original bool() calls
        # rather than switching to `is not None` — behavior must match.
        'has_image': bool(element.find('img')),
        'has_price': bool(any(symbol in text for symbol in '$€£¥')),
        'has_link': bool(element.find('a')),
    }
    return str(fingerprint)
def analyze_children_similarity(element):
    """Measure how structurally uniform an element's direct children are.

    Returns a ``(similarity_score, count)`` pair where *count* is the number
    of direct tag children sharing the most common structural signature and
    *similarity_score* is that count divided by the total number of tag
    children. Returns ``(0, 0)`` when there are no contents or no tag
    children at all.
    """
    if not element.contents:
        return 0, 0
    signatures = []
    for child in element.find_all(recursive=False):
        if child.name:
            signatures.append(get_element_signature(child))
    if not signatures:
        return 0, 0
    _, dominant_count = Counter(signatures).most_common(1)[0]
    return dominant_count / len(signatures), dominant_count
def count_images_in_element(element):
    """Return the total number of <img> tags anywhere under *element*."""
    return sum(1 for _ in element.find_all('img', recursive=True))
def get_element_identifier(element):
    """Build a readable identifier for an element: "tag .class1 .class2 #id"."""
    parts = [element.name]
    if element.get('class'):
        parts.append(f".{' .'.join(element['class'])}")
    if element.get('id'):
        parts.append(f"#{element['id']}")
    return " ".join(parts)
def convert_relative_urls(soup, base_url):
    """Rewrite href, src and data-src attributes to absolute URLs in place.

    Mutates *soup* and also returns it for call-chaining convenience.
    """
    # href/src can be matched with keyword filters; data-src needs attrs={}
    # because the hyphen makes it an invalid Python keyword.
    for attr in ('href', 'src'):
        for tag in soup.find_all(**{attr: True}):
            tag[attr] = urljoin(base_url, tag[attr])
    for tag in soup.find_all(attrs={'data-src': True}):
        tag['data-src'] = urljoin(base_url, tag['data-src'])
    return soup
def _collect_container_candidates(soup, min_children, min_similarity):
    """Find container-like elements with enough similar, image-bearing children.

    Returns a list of ``(element, image_count, combined_score, products_count)``
    tuples; *combined_score* weights child similarity by how many similar
    children and images the container holds.
    """
    candidates = []
    for element in soup.find_all():
        if element.name not in ('div', 'ul', 'section', 'main'):
            continue
        similarity_score, similar_children_count = analyze_children_similarity(element)
        image_count = count_images_in_element(element)
        if (similar_children_count >= min_children
                and similarity_score >= min_similarity
                and image_count > 0):
            # "Products" are direct tag children containing at least one image.
            products_count = len([
                child for child in element.find_all(recursive=False)
                if child.name and child.find('img', recursive=True)
            ])
            combined_score = similarity_score * similar_children_count * image_count
            candidates.append((element, image_count, combined_score, products_count))
    return candidates


def _extract_products(container):
    """Describe each direct tag child of *container*: its HTML and its images.

    Assumes SVGs were already stripped from *container* — stripping each child
    again (as the original code did) was redundant, since children of the
    stripped copy contain no <svg> tags.
    """
    products = []
    for child in container.find_all(recursive=False):
        if not child.name:  # skip text nodes
            continue
        images = [
            {
                "src": img.get('src', 'No source'),
                "alt": img.get('alt', 'No alt text'),
            }
            for img in child.find_all('img', recursive=True)
        ]
        products.append({"html_content": str(child), "images": images})
    return products


def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
    """Find elements containing images and return both sorted list and detailed top element info.

    Parameters:
        html_content: raw HTML string to analyze.
        base_url: when non-empty, relative URLs are rewritten to absolute.
        min_children: minimum number of structurally similar direct children.
        min_similarity: minimum fraction (0-1) of children sharing the
            dominant structural signature.

    Returns a 3-tuple ``(sorted_elements, top_element_info, html_output)``:
        sorted_elements: ``(identifier, image_count, products_count)`` tuples,
            best candidates first.
        top_element_info: dict describing the best container and its products,
            or ``{"error": ...}`` when nothing qualified.
        html_output: the best container's HTML with SVGs stripped ("" on error).
    """
    soup = BeautifulSoup(html_content, "html.parser")
    if base_url:
        soup = convert_relative_urls(soup, base_url)

    candidates = _collect_container_candidates(soup, min_children, min_similarity)
    if not candidates:
        return [], {"error": "No elements with images found"}, ""

    # Best-scoring containers first.
    candidates.sort(key=lambda item: item[2], reverse=True)
    sorted_elements = [
        (get_element_identifier(element), image_count, products_count)
        for element, image_count, _, products_count in candidates
    ]

    # Top candidate, with SVG markup stripped for cleaner downstream output.
    top_element_no_svg = remove_svg_elements(candidates[0][0])
    products = _extract_products(top_element_no_svg)

    top_element_info = {
        "parent": {
            "tag": top_element_no_svg.name,
            "identifier": get_element_identifier(top_element_no_svg),
            "classes": top_element_no_svg.get('class', []),
            "id": top_element_no_svg.get('id', None),
        },
        "products_count": len(products),
        "products": products,
    }
    return sorted_elements, top_element_info, str(top_element_no_svg)
def get_download_link(content, filename, content_type="file/json"):
    """Build an HTML anchor that downloads *content* as *filename*.

    The content is embedded as a base64 data URI, so the link works without
    a server round-trip.

    NOTE(review): the default content_type "file/json" is not a registered
    MIME type — callers wanting JSON should pass "application/json". The
    default is kept unchanged for backward compatibility.
    """
    b64 = base64.b64encode(content.encode()).decode()
    # Bug fix: the filename was previously hard-coded instead of interpolated,
    # so every download ignored the `filename` argument.
    return (
        f'<a href="data:{content_type};base64,{b64}" '
        f'download="{filename}">Download {filename}</a>'
    )
def main():
    """Streamlit entry point.

    Lets the user upload HTML files, tune the similarity thresholds, run the
    image-rich-container analysis per file, and download the combined results
    as JSON or as a single HTML document.
    """
    st.title("HTML File Analyzer")
    st.write("Upload HTML files to analyze their structure and find image-rich elements")

    # File uploader allows multiple files
    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])
    if not uploaded_files:
        return

    # Analysis parameters
    col1, col2 = st.columns(2)
    with col1:
        min_children = st.slider("Minimum number of similar children", 1, 10, 4)
    with col2:
        min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)

    if not st.button("Generate Analysis"):
        return

    with st.spinner('Processing files...'):
        # Bug fix: these dicts were previously initialized twice (once before
        # the button branch and again inside it); a single init suffices.
        all_results = {}       # filename -> top-element info (JSON download)
        all_html_outputs = {}  # filename -> extracted HTML (HTML download)

        for uploaded_file in uploaded_files:
            st.subheader(f"Analysis for {uploaded_file.name}")
            try:
                html_content = uploaded_file.read().decode('utf-8')
                sorted_elements, top_element_info, html_output = find_image_rich_parents(
                    html_content,
                    min_children=min_children,
                    min_similarity=min_similarity
                )
                st.write("Elements containing images:")
                for element, img_count, prod_count in sorted_elements:
                    st.write(f"- {element}: {img_count} images, {prod_count} products")
                all_results[uploaded_file.name] = top_element_info
                all_html_outputs[uploaded_file.name] = html_output
            except Exception as e:
                # Surface the failure per-file but keep processing the rest.
                st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                continue

        # Create download buttons if we have results
        if all_results:
            st.subheader("Download Results")
            col1, col2 = st.columns(2)

            # JSON download
            with col1:
                json_str = json.dumps(all_results, indent=2)
                st.markdown(get_download_link(json_str, 'analysis_results.json'),
                            unsafe_allow_html=True)

            # HTML download: combine all outputs with file names as headers
            with col2:
                combined_html = """
                <!DOCTYPE html>
                <html>
                <head>
                    <meta charset='UTF-8'>
                    <style>
                        div {
                            width: auto !important;
                            height: auto !important;
                            padding: 0 !important;
                            margin: 0 !important;
                        }
                        img {
                            width: 300px;
                            height: 300px;
                            object-fit: contain;
                        }
                        body { font-family: Arial, sans-serif; }
                        .file-section { margin: 20px 0; }
                        .file-header {
                            background: #f0f0f0;
                            padding: 10px;
                            margin: 20px 0;
                        }
                    </style>
                </head>
                <body>
                """
                for filename, html in all_html_outputs.items():
                    # Bug fix: the file name is now interpolated into the
                    # section header (it was previously hard-coded text).
                    combined_html += f"""
                    <div class="file-section">
                        <h2 class="file-header">{filename}</h2>
                        {html}
                    </div>
                    """
                combined_html += "</body></html>"
                st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
                            unsafe_allow_html=True)

        st.success("Analysis completed successfully!")
# Run the Streamlit app when executed as a script (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()