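"""
Streamlit app that scans uploaded HTML files for repeated, image-rich
container elements (e.g., product grids) and lets you download the
results as JSON or as a combined HTML page.
"""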
import base64
import copy
import json
from collections import Counter
from urllib.parse import urljoin

import streamlit as st
from bs4 import BeautifulSoup

def remove_svg_elements(element):
    """
    Remove all SVG elements from a BeautifulSoup element.
    Returns a copy of the element with SVGs removed.
    """
    # Copy the element so the original tree is not modified
    # (bs4 tags implement __copy__, so this clones the whole subtree)
    element_copy = copy.copy(element)
    # Find and remove all SVG elements
    if hasattr(element_copy, 'find_all'):
        for svg in element_copy.find_all('svg'):
            svg.decompose()
    return element_copy

def get_element_signature(element):
    """
    Create a signature for an element based on its structure.
    """
    signature = {
        'tag': element.name,
        'classes': tuple(sorted(element.get('class', []))),
        'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),
        'has_image': bool(element.find('img')),
        'has_price': bool(any(c in element.get_text() for c in '$€£¥')),
        'has_link': bool(element.find('a')),
    }
    return str(signature)
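
# For illustration: a product card like
#   <li class="product"><a href="..."><img src="..."></a><span>$9.99</span></li>
# produces the signature string (wrapped here for readability)
#   "{'tag': 'li', 'classes': ('product',), 'child_tags': ('a', 'span'),
#     'has_image': True, 'has_price': True, 'has_link': True}"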

def analyze_children_similarity(element):
    """
    Analyze how similar the direct children of an element are.
    Returns (similarity_score, count_of_most_common_signature).
    """
    if not element.contents:
        return 0, 0
    child_signatures = [
        get_element_signature(child)
        for child in element.find_all(recursive=False)
        if child.name
    ]
    if not child_signatures:
        return 0, 0
    signature_counts = Counter(child_signatures)
    most_common_sig, most_common_count = signature_counts.most_common(1)[0]
    similarity_score = most_common_count / len(child_signatures)
    return similarity_score, most_common_count
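
# For illustration: a <ul> with five direct <li> children, four of which share
# the same signature, yields (0.8, 4).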

def count_images_in_element(element):
    """
    Count all images within an element, including nested ones.
    """
    return len(element.find_all('img', recursive=True))

def get_element_identifier(element):
    """
    Create a unique identifier for an element including tag, classes, and id.
    """
    identifier = element.name
    if element.get('class'):
        identifier += f" .{' .'.join(element['class'])}"
    if element.get('id'):
        identifier += f" #{element['id']}"
    return identifier
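
# For illustration: <div class="grid products" id="catalog"> becomes
# "div .grid .products #catalog".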

def convert_relative_urls(soup, base_url):
    """
    Convert all relative URLs in the soup object to absolute URLs.
    """
    for tag in soup.find_all(href=True):
        tag['href'] = urljoin(base_url, tag['href'])
    for tag in soup.find_all(src=True):
        tag['src'] = urljoin(base_url, tag['src'])
    for tag in soup.find_all(attrs={'data-src': True}):
        tag['data-src'] = urljoin(base_url, tag['data-src'])
    return soup
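
# urljoin leaves absolute URLs untouched, so this is safe to run on mixed
# content: urljoin("https://example.com/shop/", "img/a.png") gives
# "https://example.com/shop/img/a.png", while an "https://..." src passes
# through unchanged.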

def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
    """
    Find elements containing images and return both a sorted list and
    detailed info about the top-scoring element.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Convert relative URLs to absolute if a base_url is provided
    if base_url:
        soup = convert_relative_urls(soup, base_url)
    # Collect potential container elements with their scores
    elements_with_scores = []
    for element in soup.find_all(['div', 'ul', 'section', 'main']):
        similarity_score, similar_children_count = analyze_children_similarity(element)
        image_count = count_images_in_element(element)
        if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:
            # Count products (direct children that contain an image)
            products_count = len([child for child in element.find_all(recursive=False)
                                  if child.name and child.find('img', recursive=True)])
            combined_score = similarity_score * similar_children_count * image_count
            elements_with_scores.append((element, image_count, combined_score, products_count))
    if not elements_with_scores:
        return [], {"error": "No elements with images found"}, ""
    # Sort by combined score, highest first
    elements_with_scores.sort(key=lambda x: x[2], reverse=True)
    # Build the sorted summary list
    sorted_elements = [
        (get_element_identifier(element), image_count, products_count)
        for element, image_count, _, products_count in elements_with_scores
    ]
    # The top element is the one with the highest combined score
    top_element = elements_with_scores[0][0]
    # Remove SVGs from the top element for the HTML output
    top_element_no_svg = remove_svg_elements(top_element)
    # Separate child elements ("products") with their images
    products = []
    for child in top_element_no_svg.find_all(recursive=False):
        if child.name:  # Skip text nodes
            # Remove SVGs from each product
            child_no_svg = remove_svg_elements(child)
            product_info = {
                "html_content": str(child_no_svg),
                "images": []
            }
            # Collect all images within this product
            for img in child_no_svg.find_all('img', recursive=True):
                product_info["images"].append({
                    "src": img.get('src', 'No source'),
                    "alt": img.get('alt', 'No alt text')
                })
            products.append(product_info)
    # Build the result dictionary for the top element
    top_element_info = {
        "parent": {
            "tag": top_element_no_svg.name,
            "identifier": get_element_identifier(top_element_no_svg),
            "classes": top_element_no_svg.get('class', []),
            "id": top_element_no_svg.get('id', None)
        },
        "products_count": len(products),
        "products": products
    }
    html_output = str(top_element_no_svg)
    return sorted_elements, top_element_info, html_output
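
# Example standalone use (illustrative; "page.html" is a hypothetical file):
#
#   with open("page.html", encoding="utf-8") as f:
#       sorted_elements, top_info, html_out = find_image_rich_parents(f.read())
#   for identifier, image_count, product_count in sorted_elements:
#       print(identifier, image_count, product_count)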

def get_download_link(content, filename, content_type="application/json"):
    """Generate a data-URI download link for the given content."""
    b64 = base64.b64encode(content.encode()).decode()
    return f'<a href="data:{content_type};base64,{b64}" download="{filename}">Download {filename}</a>'

def main():
    st.title("HTML File Analyzer")
    st.write("Upload HTML files to analyze their structure and find image-rich elements")
    # File uploader that accepts multiple files
    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])
    if uploaded_files:
        # Analysis parameters
        col1, col2 = st.columns(2)
        with col1:
            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
        with col2:
            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)
        # Generate button
        if st.button("Generate Analysis"):
            all_results = {}
            all_html_outputs = {}
            # Show a spinner while the files are processed
            with st.spinner('Processing files...'):
                # Process each file
                for uploaded_file in uploaded_files:
                    st.subheader(f"Analysis for {uploaded_file.name}")
                    try:
                        # Read and analyze the file
                        html_content = uploaded_file.read().decode('utf-8')
                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
                            html_content,
                            min_children=min_children,
                            min_similarity=min_similarity
                        )
                        # Display results
                        st.write("Elements containing images:")
                        for element, img_count, prod_count in sorted_elements:
                            st.write(f"- {element}: {img_count} images, {prod_count} products")
                        # Store results for download
                        all_results[uploaded_file.name] = top_element_info
                        all_html_outputs[uploaded_file.name] = html_output
                    except Exception as e:
                        st.error(f"Error processing {uploaded_file.name}: {e}")
                        continue
            # Offer download buttons if we have results
            if all_results:
                st.subheader("Download Results")
                col1, col2 = st.columns(2)
                # JSON download
                with col1:
                    json_str = json.dumps(all_results, indent=2)
                    st.markdown(get_download_link(json_str, 'analysis_results.json'),
                                unsafe_allow_html=True)
                # HTML download
                with col2:
                    # Combine all HTML outputs with file names as headers
combined_html = """
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<style>
div {
width: auto !important;
height: auto !important;
padding: 0 !important;
margin: 0 !important;
}
img {
width: 300px;
height: 300px;
object-fit: contain;
}
body { font-family: Arial, sans-serif; }
.file-section { margin: 20px 0; }
.file-header {
background: #f0f0f0;
padding: 10px;
margin: 20px 0;
}
</style>
</head>
<body>
"""
                    for filename, html in all_html_outputs.items():
                        combined_html += f"""
                        <div class="file-section">
                            <h2 class="file-header">{filename}</h2>
                            {html}
                        </div>
                        """
                    combined_html += "</body></html>"
                    st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
                                unsafe_allow_html=True)
            # Success message
            st.success("Analysis completed successfully!")
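
# To launch the app locally (assuming this file is saved as app.py):
#
#   streamlit run app.py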
if __name__ == "__main__":
main()