# NOTE: "Spaces: Sleeping" banner below came from the Hugging Face Spaces
# hosting page when this file was captured; it is not part of the program.
import json

import certifi
import mongoengine as me
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
# --- Streamlit app setup -----------------------------------------------------
st.title("Restaurant Menu Data Extraction")
url1 = st.text_input("Please enter the restaurant URL:")
if not url1:
    # Guard clause: halt this script run until the user has entered a URL
    # (equivalent to wrapping everything below in `if url1:`).
    st.stop()

# The outlet code is the second-to-last path segment of the restaurant URL.
outlet_code = url1.split('/')[-2]
st.write(f"Extracted outlet code: {outlet_code}")

# --- API request setup -------------------------------------------------------
url = "https://food.noon.com/_svc/mp-food-api-mpnoon/consumer/restaurant/outlet/details/guest"
payload = {
    "addressLat": 244538840,
    "addressLng": 543773438,
    "deliveryType": "default",
    "outletCode": outlet_code
}
headers = {
    'Connection': 'keep-alive',
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,gu;q=0.7",
    "Cache-Control": "no-cache, max-age=0, must-revalidate, no-store",
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    # NOTE(review): hard-coded session cookie with a fixed expiry -- requests
    # will start failing once it expires; it should be obtained dynamically.
    "Cookie": "bm_mi=791533C8E67CE8E7DA98E80ADED70F69~YAAQRK0cuOep9tGPAQAAUYKw3RcGDAVhD+mtWU8IH76wZL29zl4qqCjMwGv8sKtYlQWZNaFftSOvHFOvQU4+3CY2uHZyjjK6I3GeNdKEn+XHupISeNc0K16GOXLqcPOwu4sADTmxE7PYQvSQE7eimhqsBiJVRd96R8W0D2hl31FlY/4rl+NPZvM3iXjrn2GO50VMv+HhGfCnDMBwApBxgpMWFLfs0u6EYy44mg/FXbom5s5pa3cro8AS35nYHbdUbi61K9fnWRVaF8L/4z0xh7V1AEQETevb5fdGF8aB9m2UG29p2W6KSMb8DyFZLpG3vl5+IRECqZdFxaUMnykO8G/ynRHG~1; Domain=.noon.com; Path=/; Expires=Mon, 03 Jun 2024 12:41:22 GMT; Max-Age=7199; Secure"
}
# Timeout added so a hung request cannot freeze the Streamlit session forever.
response = requests.post(url, headers=headers, json=payload, timeout=30)
if response.status_code == 200:
    json_data = response.json()
    with open(f'{outlet_code}.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4)
    st.success("JSON data has been written to json file.")
else:
    st.error(f"Failed to retrieve content from the URL: {response.status_code}")
    # Bug fix: the original fell through after the error and later crashed
    # with a NameError on the undefined `json_data`; stop the run instead.
    st.stop()
def extract_items_with_categories(menu):
    """Flatten the menu into one record per (category, item) pair.

    Args:
        menu: dict with 'categories' (each holding a list of item codes under
            'items') and a flat 'items' list of item dicts keyed by 'itemCode'.

    Returns:
        list[dict]: one record per referenced item that exists in the menu,
        carrying the category name/position and the item's display fields.
        Item codes with no matching item dict are silently skipped, as before.
    """
    # Index items by code once -- the original re-scanned the whole item list
    # with next() for every referenced code (O(n^2) on large menus).
    items_by_code = {item['itemCode']: item for item in menu['items']}
    items_list = []
    for category in menu['categories']:
        category_name = category['name']
        category_position = category['position']
        for item_code in category['items']:
            item = items_by_code.get(item_code)
            if item:
                items_list.append({
                    'category': category_name,
                    'item': item['name'],
                    'itemCode': item['itemCode'],
                    'item-position': item['position'],
                    'img-url': item['image'],
                    'price': item['price'],
                    'Description': item['itemDesc'],
                    'position': category_position,
                })
    return items_list
def extract_options(menu):
    """Collect one record per (item, modifier, option) triple in the menu.

    Args:
        menu: dict with an 'items' list (items may carry a 'modifiers' list of
            modifier codes) and a 'modifiers' list keyed by 'modifierCode',
            each holding 'options' that reference items by 'itemCode'.

    Returns:
        list[dict]: option rows with the owning item's code/name, the option
        group (modifier) name and min/max, and the option item's name/price.
        Unknown modifier codes or option item codes are skipped, as before.
    """
    # Index both lookups once -- the original used next() over the full lists
    # for every modifier code and every option (O(n^2) behavior).
    items_by_code = {item['itemCode']: item for item in menu['items']}
    modifiers_by_code = {mod['modifierCode']: mod
                         for mod in menu.get('modifiers', [])}
    options_list = []
    for item in menu['items']:
        for modifier_code in item.get('modifiers', ()):
            modifier = modifiers_by_code.get(modifier_code)
            if modifier is None:
                continue
            for option in modifier['options']:
                option_item = items_by_code.get(option['itemCode'])
                if option_item is None:
                    continue
                options_list.append({
                    'itemCode': item['itemCode'],
                    'itemName': item['name'],
                    'Option Group Name': modifier['name'],
                    'Min': modifier.get('minTotalOptions'),
                    'Max': modifier.get('maxTotalOptions'),
                    'Option name': option_item['name'],
                    'Option price': option['price']
                })
    return options_list
items_list = extract_items_with_categories(json_data['data']['menu'])
options_list = extract_options(json_data['data']['menu'])


def join_with_newline(x):
    """Join a group's values into a single newline-separated string."""
    return '\n'.join(str(value) for value in x)


# Creating DataFrames with 'itemCode' column, then attach each item's option
# rows via a left join so items without options are kept.
df_items = pd.DataFrame(items_list)
df_options = pd.DataFrame(options_list)
final_df = pd.merge(df_items, df_options, on='itemCode', how='left')

# Collapse each item's option rows into newline-joined cells.
# Bug fix: the original agg dict listed 'item-position' twice; Python silently
# keeps only the last duplicate key, so it is specified exactly once here.
final_df = final_df.groupby(['item', 'category'], as_index=False).agg({
    'Option Group Name': join_with_newline,
    'Option name': join_with_newline,
    'Option price': join_with_newline,
    'Min': join_with_newline,
    'Max': join_with_newline,
    'item-position': 'first',
    'img-url': 'first',
    'price': 'first',
    'Description': 'first',
    'position': 'first'
})
final_df = final_df.drop_duplicates().reset_index(drop=True)
def split_rows(df):
    """Explode newline-joined option cells back into one row per option group.

    Each input row carries '\n'-joined strings in the option columns; the
    output has one row per entry of 'Option Group Name', with the item's
    descriptive columns repeated and shorter option columns padded with ''.
    """
    exploded = []
    for _, record in df.iterrows():
        mins = record['Min'].split('\n')
        maxs = record['Max'].split('\n')
        groups = record['Option Group Name'].split('\n')
        names = record['Option name'].split('\n')
        prices = record['Option price'].split('\n')

        # Columns copied unchanged onto every exploded row.
        base = {
            'category': record['category'],
            'item': record['item'],
            'item-position': record['item-position'],
            'img-url': record['img-url'],
            'price': record['price'],
            'Description': record['Description'],
            'position': record['position'],
        }

        def pick(values, k):
            # Pad with '' when a column has fewer entries than the group list.
            return values[k] if k < len(values) else ''

        # The group-name column determines how many rows this record yields.
        for k in range(len(groups)):
            exploded.append({
                **base,
                'Min': pick(mins, k),
                'Max': pick(maxs, k),
                'Option Group Name': pick(groups, k),
                'Option name': pick(names, k),
                'Option price': pick(prices, k),
            })
    return pd.DataFrame(exploded)
# Normalise the exploded rows: blank out NaN cells and literal 'nan' strings
# (the newline-join step stringifies missing values as 'nan').
split_df = split_rows(final_df).fillna('').replace('nan', '')
split_df.to_excel(f'{outlet_code}_1.xlsx', index=False)

# Round-trip through the intermediate Excel file so the sheet on disk and the
# JSON records stay in sync, then reload the records for the next stage.
df = pd.read_excel(f'{outlet_code}_1.xlsx')
with open(f'{outlet_code}_1.json', 'w') as f:
    f.write(df.to_json(orient='records'))

with open(f'{outlet_code}_1.json', 'r') as file:
    data = json.load(file)
def process_item(item):
    """Convert one flat row-record into a nested dict with an 'Options' list.

    Args:
        item: dict produced from a spreadsheet row; always carries the item's
            descriptive fields, plus either numbered option columns
            ("Option Group {i} Name", "Option {i} Name", "Option {i} Price",
            "Min{i}", "Max{i}") or a single un-numbered option
            ("Option Group Name", "Option name", "Option price", "Min", "Max").

    Returns:
        dict: the descriptive fields plus an 'Options' list of
        {'Option Group Name', 'Option Name', 'Option Price', 'Min', 'Max'}.
    """
    common_fields = {
        'category': item['category'],
        'category-position': item['position'],
        'item': item['item'],
        'item-position': item['item-position'],
        'Description': item['Description'],
        'img-url': item['img-url'],
        'price': item['price']
    }
    options = []

    def _add_option(group, name, price, mn, mx):
        # One normalized option entry.
        options.append({
            'Option Group Name': group,
            'Option Name': name,
            'Option Price': price,
            'Min': mn,
            'Max': mx
        })

    # Numbered option columns, if the record carries them (up to 37 groups).
    for i in range(1, 38):
        group_key = f"Option Group {i} Name"
        if group_key in item and item[group_key]:
            _add_option(item[group_key],
                        item[f"Option {i} Name"],
                        item[f"Option {i} Price"],
                        item[f"Min{i}"],
                        item[f"Max{i}"])

    # Bug fix: rows produced by split_rows carry a single *un-numbered* option
    # ("Option Group Name", ...), which the numbered scan above never matches,
    # so 'Options' was always empty. Pick that flat option up here.
    if not options and item.get("Option Group Name"):
        _add_option(item["Option Group Name"],
                    item.get("Option name", ""),
                    item.get("Option price", ""),
                    item.get("Min", ""),
                    item.get("Max", ""))

    return {**common_fields, 'Options': options}
# Nest every flat record and persist the result for the next stage.
processed_items = [process_item(record) for record in data]
with open(f'processed_items_{outlet_code}.json', 'w') as outfile:
    json.dump(processed_items, outfile, indent=4)
# Group a processed item's options by option-group name.
# NOTE: this intentionally shadows the menu-level extract_options above; the
# earlier function has already been consumed by the time this one is defined.
def extract_options(item):
    """Group the item's flat 'Options' list by 'Option Group Name'.

    Returns a dict mapping group name to parallel 'Names'/'Prices'/'Mins'/
    'Maxs' lists, plus 'GroupMin'/'GroupMax' taken from the group's first
    option (as in the original implementation).
    """
    grouped = {}
    for option in item["Options"]:
        group = grouped.setdefault(option["Option Group Name"], {
            "Names": [],
            "Prices": [],
            "Mins": [],
            "Maxs": [],
            "GroupMin": option["Min"],
            "GroupMax": option["Max"],
        })
        group["Names"].append(option["Option Name"])
        group["Prices"].append(option["Option Price"])
        group["Mins"].append(option["Min"])
        group["Maxs"].append(option["Max"])
    return grouped
with open(f'processed_items_{outlet_code}.json', 'r') as file:
    data = json.load(file)

# Expand each nested item back into spreadsheet rows: one row per option
# index, with every option group laid out side-by-side in numbered columns.
all_rows_data = []
for item in data:
    item_info = {
        'category': item['category'],
        'category-position': item['category-position'],
        'item': item['item'],
        'item-position': item['item-position'],
        'Description': item['Description'],
        'img-url': item['img-url'],
        'price': item['price']
    }
    options = extract_options(item)
    # The widest option group determines how many rows this item needs.
    max_options = max((len(g["Names"]) for g in options.values()), default=0)
    # Bug fix: `range(0)` produced no rows at all for option-less items, so
    # they vanished from the final output; always emit at least one row.
    for i in range(max_options or 1):
        row = item_info.copy()
        # Bug fix: columns are numbered ("Option Group 1", "Option 1 Name",
        # "Min 1", ...) because the MongoDB loader below reads exactly those
        # keys; the original keyed columns by group *name*, which nothing read.
        for j, (group_name, group_data) in enumerate(options.items(), start=1):
            row[f"Option Group {j}"] = group_name
            row[f"Option {j} Name"] = group_data["Names"][i] if i < len(group_data["Names"]) else ""
            row[f"Option {j} Price"] = group_data["Prices"][i] if i < len(group_data["Prices"]) else ""
            row[f"Min {j}"] = group_data["Mins"][i] if i < len(group_data["Mins"]) else ""
            row[f"Max {j}"] = group_data["Maxs"][i] if i < len(group_data["Maxs"]) else ""
        all_rows_data.append(row)

final_df = pd.DataFrame(all_rows_data)
final_df.to_excel(f'{outlet_code}_final_output.xlsx', index=False)
# --- MongoDB setup -----------------------------------------------------------
# Credentials come from Streamlit's secrets store, never from the source tree.
mongo_cfg = st.secrets["mongo"]
connection_str = (
    f"mongodb+srv://{mongo_cfg['username']}:{mongo_cfg['password']}"
    f"@{mongo_cfg['host']}/test?retryWrites=true&w=majority"
)
# certifi supplies the CA bundle for the TLS connection to Atlas.
me.connect(host=connection_str, tlsCAFile=certifi.where())
class Item(me.Document):
    """MongoDB document mirroring one row of the final menu spreadsheet."""

    category = me.StringField()        # menu category name
    category_position = me.IntField()  # ordering of the category in the menu
    item = me.StringField()            # item display name
    item_position = me.IntField()      # ordering of the item in its category
    description = me.StringField()     # item description text
    img_url = me.StringField()         # item image URL
    price = me.FloatField()            # base price
    options = me.ListField()           # option dicts (group/name/price/min/max)
# --- Load rows into MongoDB --------------------------------------------------
items_collection = []
for _, row in final_df.iterrows():
    options = []
    for i in range(1, 38):
        option_group = f"Option Group {i}"
        # Bug fix: indexing a Series with an absent label raises KeyError, so
        # any row carrying fewer than 37 option groups (or none) aborted the
        # whole load; numbered columns are contiguous, so stop at the first gap.
        if option_group not in row:
            break
        if pd.notna(row[option_group]) and row[option_group]:
            options.append({
                "group_name": row[option_group],
                "name": row[f"Option {i} Name"],
                "price": row[f"Option {i} Price"],
                "min": row[f"Min {i}"],
                "max": row[f"Max {i}"]
            })
    item_doc = Item(
        category=row['category'],
        category_position=row['category-position'],
        item=row['item'],
        item_position=row['item-position'],
        description=row['Description'],
        img_url=row['img-url'],
        price=row['price'],
        options=options
    )
    items_collection.append(item_doc)

if items_collection:
    # Single bulk insert instead of one round-trip per document.
    Item.objects.insert(items_collection)
    st.success("Data has been saved to MongoDB.")
else:
    st.warning("No data to save to MongoDB.")