"""Streamlit app: export a noon Food outlet menu to JSON, Excel and MongoDB.

Pipeline, for the outlet code taken from the URL the user enters:

1. POST to the outlet-details endpoint and save the raw response as
   ``<code>.json``.
2. Flatten the menu into one row per (item, option) and save it as
   ``<code>_1.xlsx`` / ``<code>_1.json``.
3. Fold those rows into one record per item with an ``Options`` list and
   save it as ``processed_items_<code>.json``.
4. Pivot the options so every option group gets its own columns and save the
   result as ``<code>_final_output.xlsx``.
5. Insert one ``Item`` document per menu item into MongoDB.
"""

import json
import math
import re
from typing import Any, Dict, List, Optional

import certifi
import mongoengine as me
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup  # noqa: F401 -- unused below; kept from the original import list

API_URL = "https://food.noon.com/_svc/mp-food-api-mpnoon/consumer/restaurant/outlet/details/guest"

# Without a timeout a stalled connection would block the Streamlit run forever.
REQUEST_TIMEOUT_SECONDS = 30

# NOTE(review): the Cookie value carries Set-Cookie attributes (Domain, Path,
# Expires=Mon, 03 Jun 2024, ...), so it looks like a captured response header.
# It is kept verbatim -- TODO confirm whether the API still needs it.
REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,gu;q=0.7",
    "Cache-Control": "no-cache, max-age=0, must-revalidate, no-store",
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Cookie":"bm_mi=791533C8E67CE8E7DA98E80ADED70F69~YAAQRK0cuOep9tGPAQAAUYKw3RcGDAVhD+mtWU8IH76wZL29zl4qqCjMwGv8sKtYlQWZNaFftSOvHFOvQU4+3CY2uHZyjjK6I3GeNdKEn+XHupISeNc0K16GOXLqcPOwu4sADTmxE7PYQvSQE7eimhqsBiJVRd96R8W0D2hl31FlY/4rl+NPZvM3iXjrn2GO50VMv+HhGfCnDMBwApBxgpMWFLfs0u6EYy44mg/FXbom5s5pa3cro8AS35nYHbdUbi61K9fnWRVaF8L/4z0xh7V1AEQETevb5fdGF8aB9m2UG29p2W6KSMb8DyFZLpG3vl5+IRECqZdFxaUMnykO8G/ynRHG~1; Domain=.noon.com; Path=/; Expires=Mon, 03 Jun 2024 12:41:22 GMT; Max-Age=7199; Secure",
}

# The outlet code becomes part of several file names, so only accept
# characters that cannot form a path.
OUTLET_CODE_PATTERN = re.compile(r"[\w.-]+")

# Column order of the two intermediate tables. Passing these to DataFrame()
# guarantees the columns exist even when a list is empty.
ITEM_COLUMNS = [
    'category', 'item', 'itemCode', 'item-position',
    'img-url', 'price', 'Description', 'position',
]
OPTION_COLUMNS = [
    'itemCode', 'itemName', 'Option Group Name',
    'Min', 'Max', 'Option name', 'Option price',
]


class Item(me.Document):
    """MongoDB document holding one menu item and its selectable options."""

    category = me.StringField()
    category_position = me.IntField()
    item = me.StringField()
    item_position = me.IntField()
    description = me.StringField()
    img_url = me.StringField()
    price = me.FloatField()
    # List of dicts with keys: group_name, name, price, min, max.
    options = me.ListField()


def parse_outlet_code(url: str) -> Optional[str]:
    """Return the second-to-last ``/``-separated segment of *url*.

    Returns ``None`` when there is no such segment or when it contains
    characters that are unsafe to embed in a file name.
    """
    segments = url.split('/')
    if len(segments) < 2:
        return None
    code = segments[-2]
    # strip('.') rejects '.', '..' and similar dot-only names.
    if not OUTLET_CODE_PATTERN.fullmatch(code) or not code.strip('.'):
        return None
    return code


def fetch_outlet_details(outlet_code: str) -> Optional[Any]:
    """POST to the outlet-details endpoint and return the decoded JSON body.

    Every failure (network error, non-200 status, undecodable body) is
    reported through ``st.error`` and yields ``None`` so the caller can stop.
    """
    payload = {
        "addressLat": 244538840,
        "addressLng": 543773438,
        "deliveryType": "default",
        "outletCode": outlet_code,
    }
    try:
        response = requests.post(
            API_URL,
            headers=REQUEST_HEADERS,
            json=payload,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        st.error(f"Failed to retrieve content from the URL: {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Failed to retrieve content from the URL: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:  # body was not valid JSON
        st.error(f"Failed to retrieve content from the URL: {exc}")
        return None


def _index_by(records: List[Dict[str, Any]], key: str) -> Dict[Any, Dict[str, Any]]:
    """Map ``record[key]`` to the first record carrying that value."""
    index: Dict[Any, Dict[str, Any]] = {}
    for record in records:
        value = record.get(key)
        if value is not None:
            # setdefault keeps the first match, like a linear search would.
            index.setdefault(value, record)
    return index


def extract_items_with_categories(menu: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return one row per (category, item) pair listed in ``menu['categories']``.

    Item codes that have no entry in ``menu['items']`` are skipped.
    """
    items_by_code = _index_by(menu['items'], 'itemCode')
    items_list = []
    for category in menu['categories']:
        category_name = category['name']
        category_position = category['position']
        for item_code in category['items']:
            item = items_by_code.get(item_code)
            if not item:
                continue
            items_list.append({
                'category': category_name,
                'item': item['name'],
                'itemCode': item['itemCode'],
                'item-position': item['position'],
                'img-url': item['image'],
                'price': item['price'],
                'Description': item['itemDesc'],
                'position': category_position,
            })
    return items_list


def extract_options(menu: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return one row per (item, modifier, option) combination in *menu*.

    Modifier codes and option item codes that cannot be resolved are skipped.
    """
    items_by_code = _index_by(menu['items'], 'itemCode')
    # Only needed when at least one item declares modifiers.
    modifiers_by_code: Optional[Dict[Any, Dict[str, Any]]] = None

    options_list = []
    for item in menu['items']:
        if 'modifiers' not in item:
            continue
        if modifiers_by_code is None:
            modifiers_by_code = _index_by(menu['modifiers'], 'modifierCode')
        for modifier_code in item['modifiers']:
            modifier = modifiers_by_code.get(modifier_code)
            if not modifier:
                continue
            for option in modifier['options']:
                option_item = items_by_code.get(option['itemCode'])
                if not option_item:
                    continue
                options_list.append({
                    'itemCode': item['itemCode'],
                    'itemName': item['name'],
                    'Option Group Name': modifier['name'],
                    'Min': modifier.get('minTotalOptions'),
                    'Max': modifier.get('maxTotalOptions'),
                    'Option name': option_item['name'],
                    'Option price': option['price'],
                })
    return options_list


def _is_missing(value: Any) -> bool:
    """Return True for ``None`` and float NaN."""
    return value is None or (isinstance(value, float) and math.isnan(value))


def join_with_newline(x) -> str:
    """Join the values of *x* with newlines, rendering missing values as ''."""
    return '\n'.join('' if _is_missing(value) else str(value) for value in x)


def split_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Expand newline-joined option columns back into one row per option.

    The number of output rows per input row is driven by 'Option Group Name';
    any other option column that turns out shorter is padded with ''.

    NOTE(review): an option value that itself contains a newline would be
    split across rows; the join/split format cannot represent it.
    """
    def pick(values: List[str], position: int) -> str:
        return values[position] if position < len(values) else ''

    rows = []
    for _, row in df.iterrows():
        min_values = row['Min'].split('\n')
        max_values = row['Max'].split('\n')
        option_groups = row['Option Group Name'].split('\n')
        option_names = row['Option name'].split('\n')
        option_prices = row['Option price'].split('\n')

        for position, option_group in enumerate(option_groups):
            rows.append({
                'category': row['category'],
                'item': row['item'],
                'item-position': row['item-position'],
                'img-url': row['img-url'],
                'price': row['price'],
                'Description': row['Description'],
                'position': row['position'],
                'Min': pick(min_values, position),
                'Max': pick(max_values, position),
                'Option Group Name': option_group,
                'Option name': pick(option_names, position),
                'Option price': pick(option_prices, position),
            })
    return pd.DataFrame(rows)


def build_long_table(
    items_list: List[Dict[str, Any]],
    options_list: List[Dict[str, Any]],
) -> pd.DataFrame:
    """Return one row per (item, option); items without options get one row.

    *items_list* must be non-empty.
    """
    df_items = pd.DataFrame(items_list, columns=ITEM_COLUMNS)
    df_options = pd.DataFrame(options_list, columns=OPTION_COLUMNS)
    if df_options.empty:
        # An empty frame has object columns; align the key dtype so the merge
        # cannot fail on mismatched key types.
        df_options['itemCode'] = df_options['itemCode'].astype(df_items['itemCode'].dtype)

    merged = pd.merge(df_items, df_options, on='itemCode', how='left')
    grouped = merged.groupby(['item', 'category'], as_index=False).agg({
        'Option Group Name': join_with_newline,
        'Option name': join_with_newline,
        'Option price': join_with_newline,
        'Min': join_with_newline,
        'Max': join_with_newline,
        'item-position': 'first',
        'img-url': 'first',
        'price': 'first',
        'Description': 'first',
        'position': 'first',
    })
    grouped = grouped.drop_duplicates().reset_index(drop=True)
    return split_rows(grouped).fillna('')


def process_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one long-table row into an item record with an 'Options' list.

    A row describes at most one option, so 'Options' holds zero or one entry.
    The option is read from the columns the long table actually has
    ('Option Group Name', 'Option name', 'Option price', 'Min', 'Max').
    """
    common_fields = {
        'category': item['category'],
        'category-position': item['position'],
        'item': item['item'],
        'item-position': item['item-position'],
        'Description': item['Description'],
        'img-url': item['img-url'],
        'price': item['price'],
    }
    options = []
    group_name = item.get('Option Group Name')
    if group_name is not None and group_name != '':
        options.append({
            'Option Group Name': group_name,
            'Option Name': item.get('Option name'),
            'Option Price': item.get('Option price'),
            'Min': item.get('Min'),
            'Max': item.get('Max'),
        })
    return {**common_fields, 'Options': options}


def collect_items(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold long-table rows into one record per (category, item).

    Records keep the order in which each item first appears; the options of
    later rows for the same item are appended to its 'Options' list.
    """
    records: Dict[Any, Dict[str, Any]] = {}
    for row in rows:
        record = process_item(row)
        key = (record['category'], record['item'])
        if key in records:
            records[key]['Options'].extend(record['Options'])
        else:
            records[key] = record
    return list(records.values())


def group_options_by_name(item: Dict[str, Any]) -> Dict[Any, Dict[str, Any]]:
    """Group ``item['Options']`` by option group name.

    Each group maps to parallel lists 'Names', 'Prices', 'Mins', 'Maxs' plus
    'GroupMin'/'GroupMax' taken from the first option seen in that group.
    """
    option_data: Dict[Any, Dict[str, Any]] = {}
    for option in item["Options"]:
        option_group_name = option["Option Group Name"]
        if option_group_name not in option_data:
            option_data[option_group_name] = {
                "Names": [],
                "Prices": [],
                "Mins": [],
                "Maxs": [],
                "GroupMin": option["Min"],
                "GroupMax": option["Max"],
            }
        group = option_data[option_group_name]
        group["Names"].append(option["Option Name"])
        group["Prices"].append(option["Option Price"])
        group["Mins"].append(option["Min"])
        group["Maxs"].append(option["Max"])
    return option_data


def build_wide_rows(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Pivot each item's options so every option group gets its own columns.

    An item yields as many rows as its largest option group has options, and
    at least one row, so items without options are still exported.
    """
    all_rows_data = []
    for item in items:
        item_info = {
            'category': item['category'],
            'category-position': item['category-position'],
            'item': item['item'],
            'item-position': item['item-position'],
            'Description': item['Description'],
            'img-url': item['img-url'],
            'price': item['price'],
        }
        options = group_options_by_name(item)
        max_options = max((len(group["Names"]) for group in options.values()), default=0)

        for i in range(max(max_options, 1)):
            row = item_info.copy()
            for group_name, group_data in options.items():
                row[f"Option Group {group_name}"] = group_name
                row[f"Option {group_name} Name"] = group_data["Names"][i] if i < len(group_data["Names"]) else ""
                row[f"Option {group_name} Price"] = group_data["Prices"][i] if i < len(group_data["Prices"]) else ""
                row[f"Min {group_name}"] = group_data["Mins"][i] if i < len(group_data["Mins"]) else ""
                row[f"Max {group_name}"] = group_data["Maxs"][i] if i < len(group_data["Maxs"]) else ""
            all_rows_data.append(row)
    return all_rows_data


def _as_text(value: Any) -> Optional[str]:
    """Return *value* as ``str`` (``None`` stays ``None``).

    The Excel round trip may hand back numbers for text columns, which a
    mongoengine StringField would reject.
    """
    return None if value is None else str(value)


def save_items_to_mongo(items: List[Dict[str, Any]]) -> None:
    """Insert one ``Item`` document per record of *items* into MongoDB.

    Connection settings come from ``st.secrets["mongo"]``. The outcome is
    reported with ``st.success`` / ``st.warning``.
    """
    if not items:
        st.warning("No data to save to MongoDB.")
        return

    mongo_host = st.secrets["mongo"]["host"]
    mongo_username = st.secrets["mongo"]["username"]
    mongo_password = st.secrets["mongo"]["password"]

    # NOTE(review): credentials are interpolated as-is, so they must already be
    # percent-escaped in the secrets file if they contain reserved characters.
    connection_str = f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_host}/test?retryWrites=true&w=majority"
    me.connect(host=connection_str, tlsCAFile=certifi.where())

    items_collection = [
        Item(
            category=_as_text(item['category']),
            category_position=item['category-position'],
            item=_as_text(item['item']),
            item_position=item['item-position'],
            description=_as_text(item['Description']),
            img_url=_as_text(item['img-url']),
            price=item['price'],
            options=[
                {
                    "group_name": option["Option Group Name"],
                    "name": option["Option Name"],
                    "price": option["Option Price"],
                    "min": option["Min"],
                    "max": option["Max"],
                }
                for option in item["Options"]
            ],
        )
        for item in items
    ]
    Item.objects.insert(items_collection)
    st.success("Data has been saved to MongoDB.")


def main() -> None:
    """Run the Streamlit page: read the URL, export the menu, store it."""
    st.title("Restaurant Menu Data Extraction")
    url1 = st.text_input("Please enter the restaurant URL:")
    if not url1:
        return

    # Extract outlet code from URL
    outlet_code = parse_outlet_code(url1)
    if outlet_code is None:
        st.error("Could not extract a valid outlet code from the URL.")
        return
    st.write(f"Extracted outlet code: {outlet_code}")

    # Fetch the outlet details; stop here on any failure instead of running
    # the rest of the pipeline without data.
    json_data = fetch_outlet_details(outlet_code)
    if json_data is None:
        return
    with open(f'{outlet_code}.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4)
    st.success("JSON data has been written to json file.")

    try:
        menu = json_data['data']['menu']
    except (KeyError, TypeError):
        st.error("The response does not contain menu data.")
        return

    # Extract items and options
    items_list = extract_items_with_categories(menu)
    options_list = extract_options(menu)
    if not items_list:
        st.warning("No menu items found for this outlet.")
        return

    # Long table: one row per (item, option).
    split_df = build_long_table(items_list, options_list)
    split_df.to_excel(f'{outlet_code}_1.xlsx', index=False)

    # Reading the sheet back lets pandas re-infer column types and turns the
    # blank cells into nulls before the JSON export.
    df = pd.read_excel(f'{outlet_code}_1.xlsx')
    json_text = df.to_json(orient='records')
    with open(f'{outlet_code}_1.json', 'w') as f:
        f.write(json_text)
    data = json.loads(json_text)

    # One record per item, carrying all of its options.
    processed_items = collect_items(data)
    with open(f'processed_items_{outlet_code}.json', 'w') as outfile:
        json.dump(processed_items, outfile, indent=4)

    # Wide table: one set of columns per option group.
    final_df = pd.DataFrame(build_wide_rows(processed_items))
    final_df.to_excel(f'{outlet_code}_final_output.xlsx', index=False)

    save_items_to_mongo(processed_items)


# `streamlit run` executes the script as `__main__`, so the page still renders;
# the guard only keeps a plain import free of side effects.
if __name__ == "__main__":
    main()