# ND-Scrapper / Excel /streamlit_excel.py
# viraj
# Initial Commit
# 9e6917b
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
import streamlit as st
import mongoengine as me
import certifi
# Streamlit app setup
st.title("Restaurant Menu Data Extraction")
url1 = st.text_input("Please enter the restaurant URL:")

# Everything below needs a URL; end this script run until the user enters one.
if not url1:
    st.stop()

# Extract outlet code from URL
# NOTE(review): the second-to-last '/'-separated segment is used, which
# presumably assumes the URL ends with a trailing slash — confirm.
outlet_code = url1.split('/')[-2]
st.write(f"Extracted outlet code: {outlet_code}")

# API request setup
url = "https://food.noon.com/_svc/mp-food-api-mpnoon/consumer/restaurant/outlet/details/guest"
payload = {
    "addressLat": 244538840,
    "addressLng": 543773438,
    "deliveryType": "default",
    "outletCode": outlet_code
}
# NOTE(review): the Cookie value below is a hard-coded capture that carries
# its own "Expires=Mon, 03 Jun 2024" attribute — verify it is still needed.
headers = {
    'Connection': 'keep-alive',
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,gu;q=0.7",
    "Cache-Control": "no-cache, max-age=0, must-revalidate, no-store",
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Cookie":"bm_mi=791533C8E67CE8E7DA98E80ADED70F69~YAAQRK0cuOep9tGPAQAAUYKw3RcGDAVhD+mtWU8IH76wZL29zl4qqCjMwGv8sKtYlQWZNaFftSOvHFOvQU4+3CY2uHZyjjK6I3GeNdKEn+XHupISeNc0K16GOXLqcPOwu4sADTmxE7PYQvSQE7eimhqsBiJVRd96R8W0D2hl31FlY/4rl+NPZvM3iXjrn2GO50VMv+HhGfCnDMBwApBxgpMWFLfs0u6EYy44mg/FXbom5s5pa3cro8AS35nYHbdUbi61K9fnWRVaF8L/4z0xh7V1AEQETevb5fdGF8aB9m2UG29p2W6KSMb8DyFZLpG3vl5+IRECqZdFxaUMnykO8G/ynRHG~1; Domain=.noon.com; Path=/; Expires=Mon, 03 Jun 2024 12:41:22 GMT; Max-Age=7199; Secure"
}

# A timeout keeps the app from hanging forever on an unresponsive server, and
# network-level failures are reported in the UI instead of as a traceback.
try:
    response = requests.post(url, headers=headers, json=payload, timeout=30)
except requests.RequestException as exc:
    st.error(f"Failed to retrieve content from the URL: {exc}")
    st.stop()

if response.status_code == 200:
    json_data = response.json()
    # Keep a copy of the raw API response next to the script.
    with open(f'{outlet_code}.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4)
    st.success("JSON data has been written to json file.")
else:
    st.error(f"Failed to retrieve content from the URL: {response.status_code}")
    # Without a response body `json_data` is never bound, so the processing
    # further down would fail with NameError; stop this run here instead.
    st.stop()
# Extract items and options
def extract_items_with_categories(menu):
    """Flatten a menu into one record per (category, item) pair.

    Args:
        menu: Mapping with a ``'categories'`` list (each entry has ``'name'``,
            ``'position'`` and an ``'items'`` list of item codes) and an
            ``'items'`` list of item mappings keyed by ``'itemCode'``.

    Returns:
        A list of dicts with the keys ``category``, ``item``, ``itemCode``,
        ``item-position``, ``img-url``, ``price``, ``Description`` and
        ``position`` (the category position), in category order. Item codes
        with no matching entry in ``menu['items']`` are skipped.

    Raises:
        KeyError: If a category or a matched item lacks one of the fields
            read here.
    """
    # Index the items once instead of scanning the whole list for every code.
    # setdefault keeps the first entry for a duplicated code, matching a
    # first-match linear search.
    items_by_code = {}
    for entry in menu.get('items', []):
        items_by_code.setdefault(entry['itemCode'], entry)

    items_list = []
    for category in menu['categories']:
        category_name = category['name']
        category_position = category['position']
        for item_code in category['items']:
            item = items_by_code.get(item_code)
            if not item:
                continue
            items_list.append({
                'category': category_name,
                'item': item['name'],
                'itemCode': item['itemCode'],
                'item-position': item['position'],
                'img-url': item['image'],
                'price': item['price'],
                'Description': item['itemDesc'],
                'position': category_position,
            })
    return items_list
def extract_options(menu):
    """List every option offered through the modifiers of each menu item.

    Note: this name is rebound further down the file by a second
    ``extract_options`` that takes a processed item instead of a menu.

    Args:
        menu: Mapping with an ``'items'`` list (each entry has ``'itemCode'``,
            ``'name'`` and optionally a ``'modifiers'`` list of modifier
            codes) and a ``'modifiers'`` list whose entries have
            ``'modifierCode'``, ``'name'`` and an ``'options'`` list of
            ``{'itemCode', 'price'}`` mappings.

    Returns:
        A list of dicts with the keys ``itemCode``, ``itemName``,
        ``Option Group Name``, ``Min``, ``Max``, ``Option name`` and
        ``Option price``. Unknown modifier codes and options whose item code
        is not in ``menu['items']`` are skipped.

    Raises:
        KeyError: If a matched modifier or option lacks a field read here.
    """
    # Build both lookup tables once; setdefault keeps the first entry for a
    # duplicated code, matching a first-match linear search.
    items_by_code = {}
    for entry in menu['items']:
        items_by_code.setdefault(entry['itemCode'], entry)
    modifiers_by_code = {}
    for entry in menu.get('modifiers', []):
        modifiers_by_code.setdefault(entry['modifierCode'], entry)

    options_list = []
    for item in menu['items']:
        # `or []` also covers an item whose 'modifiers' value is None.
        for modifier_code in item.get('modifiers') or []:
            modifier = modifiers_by_code.get(modifier_code)
            if not modifier:
                continue
            for option in modifier['options']:
                option_item = items_by_code.get(option['itemCode'])
                if not option_item:
                    continue
                options_list.append({
                    'itemCode': item['itemCode'],
                    'itemName': item['name'],
                    'Option Group Name': modifier['name'],
                    'Min': modifier.get('minTotalOptions'),
                    'Max': modifier.get('maxTotalOptions'),
                    'Option name': option_item['name'],
                    'Option price': option['price']
                })
    return options_list
menu = json_data['data']['menu']
items_list = extract_items_with_categories(menu)
options_list = extract_options(menu)

# With no items there is nothing to export, and an empty items frame would
# have no 'itemCode' column to merge on; stop this run with a message.
if not items_list:
    st.warning("No menu items were found for this outlet.")
    st.stop()


def join_with_newline(x):
    """Join the values of a group into one newline-separated string."""
    return '\n'.join(str(value) for value in x)


# Creating DataFrames with 'itemCode' column
df_items = pd.DataFrame(items_list)
# Naming the columns explicitly keeps 'itemCode' (and the option columns the
# aggregation below expects) present even when the menu has no options at all.
option_columns = [
    'itemCode', 'itemName', 'Option Group Name', 'Min', 'Max',
    'Option name', 'Option price',
]
df_options = pd.DataFrame(options_list, columns=option_columns)

# Merge the DataFrames based on 'itemCode'; the left join keeps items that
# have no options (their option columns are NaN).
final_df = pd.merge(df_items, df_options, on='itemCode', how='left')

# Collapse to one row per (item, category): option columns become
# newline-joined strings, the remaining columns keep their first value.
final_df = final_df.groupby(['item', 'category'], as_index=False).agg({
    'Option Group Name': join_with_newline,
    'Option name': join_with_newline,
    'Option price': join_with_newline,
    'Min': join_with_newline,
    'Max': join_with_newline,
    'item-position': 'first',
    'img-url': 'first',
    'price': 'first',
    'Description': 'first',
    'position': 'first'
})
final_df = final_df.drop_duplicates().reset_index(drop=True)
def split_rows(df):
    """Expand newline-joined option columns back into one row per option.

    Args:
        df: DataFrame whose ``Min``, ``Max``, ``Option Group Name``,
            ``Option name`` and ``Option price`` cells are newline-separated
            strings, alongside the per-item columns ``category``, ``item``,
            ``item-position``, ``img-url``, ``price``, ``Description`` and
            ``position``.

    Returns:
        A new DataFrame with one row per entry of ``Option Group Name``. The
        per-item columns are repeated on every row; an option column that has
        fewer entries than ``Option Group Name`` is padded with ``''``.
    """
    carried_columns = (
        'category', 'item', 'item-position', 'img-url', 'price',
        'Description', 'position',
    )
    # Order matters: it fixes the column order of the resulting frame.
    multi_value_columns = (
        'Min', 'Max', 'Option Group Name', 'Option name', 'Option price',
    )

    expanded = []
    for _, record in df.iterrows():
        pieces = {col: record[col].split('\n') for col in multi_value_columns}
        # The number of option groups drives how many rows this item yields.
        for pos in range(len(pieces['Option Group Name'])):
            new_row = {col: record[col] for col in carried_columns}
            for col in multi_value_columns:
                values = pieces[col]
                new_row[col] = values[pos] if pos < len(values) else ''
            expanded.append(new_row)
    return pd.DataFrame(expanded)
# One row per option again, with missing values and the literal 'nan'
# strings (left over from str() on NaN cells) blanked out.
split_df = split_rows(final_df)
split_df = split_df.fillna('')
split_df = split_df.replace('nan', '')

intermediate_xlsx = f'{outlet_code}_1.xlsx'
split_df.to_excel(intermediate_xlsx, index=False)

# Final processing to JSON: the workbook just written is read back and dumped
# as a list of row records.
df = pd.read_excel(intermediate_xlsx)
json_data = df.to_json(orient='records')

intermediate_json = f'{outlet_code}_1.json'
with open(intermediate_json, 'w') as f:
    f.write(json_data)

# Process JSON data for final output: reload the records as plain Python
# dicts for the steps that follow.
with open(intermediate_json, 'r') as file:
    data = json.load(file)
def process_item(item, max_option_groups=37):
    """Convert one flat record into item fields plus a list of its options.

    Args:
        item: Mapping with the keys ``category``, ``position``, ``item``,
            ``item-position``, ``Description``, ``img-url`` and ``price``.
            Options are read from numbered keys ``"Option Group {i} Name"``,
            ``"Option {i} Name"``, ``"Option {i} Price"``, ``"Min{i}"`` and
            ``"Max{i}"``.
        max_option_groups: Highest option-group number to probe (inclusive).
            The default of 37 matches the previously hard-coded bound.

    Returns:
        A dict with ``category``, ``category-position``, ``item``,
        ``item-position``, ``Description``, ``img-url``, ``price`` and
        ``Options``: one dict per numbered group whose name is present and
        truthy.

    Raises:
        KeyError: If a common field is missing, or a group name is present
            without its companion name/price/min/max keys.
    """
    # NOTE(review): the records this file loads into `data` come from the
    # split_rows output, whose option columns are un-numbered
    # ('Option Group Name', 'Option name', 'Option price', 'Min', 'Max').
    # None of the numbered keys probed below exist on those records, so
    # 'Options' appears to always come back empty in this pipeline — confirm
    # the intended input layout.
    common_fields = {
        'category': item['category'],
        'category-position': item['position'],
        'item': item['item'],
        'item-position': item['item-position'],
        'Description': item['Description'],
        'img-url': item['img-url'],
        'price': item['price']
    }
    options = []
    for i in range(1, max_option_groups + 1):
        option_group_name = f"Option Group {i} Name"
        # A missing or empty group name means this slot is unused.
        if not item.get(option_group_name):
            continue
        options.append({
            'Option Group Name': item[option_group_name],
            'Option Name': item[f"Option {i} Name"],
            'Option Price': item[f"Option {i} Price"],
            'Min': item[f"Min{i}"],
            'Max': item[f"Max{i}"]
        })
    return {**common_fields, 'Options': options}
# Convert every flat record and persist the result as pretty-printed JSON.
processed_items = list(map(process_item, data))
with open(f'processed_items_{outlet_code}.json', 'w') as outfile:
    outfile.write(json.dumps(processed_items, indent=4))
# Extract options
def extract_options(item):
    """Group a processed item's options by their option-group name.

    This definition rebinds the ``extract_options`` name defined earlier in
    the file for menu mappings.

    Args:
        item: Mapping with an ``"Options"`` list whose entries have the keys
            ``"Option Group Name"``, ``"Option Name"``, ``"Option Price"``,
            ``"Min"`` and ``"Max"``.

    Returns:
        A dict keyed by group name. Each value holds the parallel lists
        ``"Names"``, ``"Prices"``, ``"Mins"`` and ``"Maxs"`` (one entry per
        option, in input order) plus ``"GroupMin"``/``"GroupMax"`` taken from
        the first option seen for that group.
    """
    grouped = {}
    for entry in item["Options"]:
        group_key = entry["Option Group Name"]
        bucket = grouped.get(group_key)
        if bucket is None:
            # First option of this group: it also supplies the group limits.
            bucket = {
                "Names": [],
                "Prices": [],
                "Mins": [],
                "Maxs": [],
                "GroupMin": entry["Min"],
                "GroupMax": entry["Max"],
            }
            grouped[group_key] = bucket
        bucket["Names"].append(entry["Option Name"])
        bucket["Prices"].append(entry["Option Price"])
        bucket["Mins"].append(entry["Min"])
        bucket["Maxs"].append(entry["Max"])
    return grouped
# Reload the processed items written earlier.
with open(f'processed_items_{outlet_code}.json', 'r') as file:
    data = json.load(file)

# Per-item fields copied onto every output row.
item_field_names = (
    'category', 'category-position', 'item', 'item-position',
    'Description', 'img-url', 'price',
)


def _value_or_blank(values, position):
    """Return values[position], or '' when the list is too short."""
    return values[position] if position < len(values) else ""


all_rows_data = []
for item in data:
    item_info = {field: item[field] for field in item_field_names}
    options = extract_options(item)

    # One output row per option index, up to the largest group of this item.
    # NOTE(review): an item without any options yields zero rows here, so it
    # is absent from the final workbook — confirm that is intended.
    max_options = max(
        (len(group["Names"]) for group in options.values()), default=0
    )
    for i in range(max_options):
        row = dict(item_info)
        # Each group contributes its own set of columns, named after the group.
        for group_name, group_data in options.items():
            row[f"Option Group {group_name}"] = group_name
            row[f"Option {group_name} Name"] = _value_or_blank(group_data["Names"], i)
            row[f"Option {group_name} Price"] = _value_or_blank(group_data["Prices"], i)
            row[f"Min {group_name}"] = _value_or_blank(group_data["Mins"], i)
            row[f"Max {group_name}"] = _value_or_blank(group_data["Maxs"], i)
        all_rows_data.append(row)

final_df = pd.DataFrame(all_rows_data)
final_df.to_excel(f'{outlet_code}_final_output.xlsx', index=False)
# MongoDB setup: credentials come from the "mongo" section of Streamlit secrets.
mongo_secrets = st.secrets["mongo"]
mongo_host = mongo_secrets["host"]
mongo_username = mongo_secrets["username"]
mongo_password = mongo_secrets["password"]

# Connect to MongoDB
# NOTE(review): username and password are placed into the URI as-is; if they
# can contain characters such as '@', ':' or '/', they presumably need to be
# percent-escaped first — confirm how the secrets are stored.
connection_str = (
    f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_host}"
    "/test?retryWrites=true&w=majority"
)
# certifi supplies the CA bundle used for the TLS connection.
me.connect(host=connection_str, tlsCAFile=certifi.where())
class Item(me.Document):
    """MongoEngine document for one exported menu row and its options."""

    # Category name and the category's position value.
    category = me.StringField()
    category_position = me.IntField()
    # Item name and the item's position value.
    item = me.StringField()
    item_position = me.IntField()
    # Item description text and image URL.
    description = me.StringField()
    img_url = me.StringField()
    # Item price.
    price = me.FloatField()
    # Untyped list; the code that builds documents in this file fills it with
    # dicts holding group_name, name, price, min and max.
    options = me.ListField()
# Build one Item document per row of the final frame and bulk-insert them.
items_collection = []

# Only probe numbered option-group columns that actually exist in the frame;
# indexing a row with an absent column label raises KeyError.
available_groups = [
    i for i in range(1, 38) if f"Option Group {i}" in final_df.columns
]

for index, row in final_df.iterrows():
    options = []
    for i in available_groups:
        group_value = row[f"Option Group {i}"]
        # Skip slots that are empty for this row (NaN or blank).
        if pd.isna(group_value) or not group_value:
            continue
        # .get() yields None when a companion column is missing.
        options.append({
            "group_name": group_value,
            "name": row.get(f"Option {i} Name"),
            "price": row.get(f"Option {i} Price"),
            "min": row.get(f"Min {i}"),
            "max": row.get(f"Max {i}")
        })
    item_doc = Item(
        category=row['category'],
        category_position=row['category-position'],
        item=row['item'],
        item_position=row['item-position'],
        description=row['Description'],
        img_url=row['img-url'],
        price=row['price'],
        options=options
    )
    items_collection.append(item_doc)

if items_collection:
    Item.objects.insert(items_collection)
    st.success("Data has been saved to MongoDB.")
else:
    st.warning("No data to save to MongoDB.")