Spaces:
Sleeping
Sleeping
import json
import os
from typing import List
from urllib.parse import urljoin

import certifi
import mongoengine as me
import requests
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# MongoDB connection settings.
# Every value can be overridden through an environment variable so that a
# deployment does not have to rely on secrets committed to source control.
# The literal fallbacks keep the previous behaviour when nothing is configured.
# NOTE(review): the fallback credentials are exposed in this file; rotate them
# and drop the fallbacks once MONGO_USERNAME / MONGO_PASSWORD are provisioned.
db_name = os.environ.get('MONGO_DB_NAME', 'Deliveroo')
host = os.environ.get('MONGO_HOST', 'ac-xvwfer4-shard-00-00.xrv1il9.mongodb.net')
port = int(os.environ.get('MONGO_PORT', '27017'))
username = os.environ.get('MONGO_USERNAME', 'sahlhubdeliveroo')
password = os.environ.get('MONGO_PASSWORD', 'vp5bLCjufKdarP1W')
alias = 'default'

# Drop any connection already registered under this alias before registering
# the new one.
me.disconnect(alias)
me.connect(
    db_name,
    alias=alias,
    host=host,
    port=port,
    username=username,
    password=password,
    ssl=True,
    tlsCAFile=certifi.where(),
)
# Define the MongoDB document models
class DeliverooLink(me.Document):
    """A restaurant page URL; the menu documents in this module reference it."""

    # URL of the restaurant page (the scraping code stores the absolute URL).
    href = me.StringField(required=True)
class MenuOption(me.Document):
    """A single selectable option that can appear in one or more option groups."""

    # Display name of the option.
    name = me.StringField(required=True)
    # Price stored as text (save_data_to_db writes the formatted price string).
    price = me.StringField(required=True)
    # Groups that contain this option. The target is given by name because
    # MenuOptionGroup is declared after this class.
    option_groups = me.ListField(me.ReferenceField('MenuOptionGroup'))
    # Location value supplied by the caller when the data was saved.
    location = me.StringField(required=True)
    # Restaurant link this option belongs to.
    restro_ref_id = me.ReferenceField(DeliverooLink, required=True)
class MenuOptionGroup(me.Document):
    """A named group of options together with its selection limits."""

    # Display name of the group.
    group_name = me.StringField(required=True)
    # Options that belong to this group.
    options = me.ListField(me.ReferenceField(MenuOption))
    # Selection bounds; save_data_to_db fills them from the scraped
    # 'minSelection' / 'maxSelection' values (defaulting to 0 and 1).
    min_quantity = me.IntField(required=True)
    max_quantity = me.IntField(required=True)
    # Location value supplied by the caller when the data was saved.
    location = me.StringField(required=True)
    # Restaurant link this group belongs to.
    restro_ref_id = me.ReferenceField(DeliverooLink, required=True)
class MenuItem(me.Document):
    """A menu entry of a restaurant, with its category name and option groups."""

    # Name of the category the item is listed under.
    category_name = me.StringField(required=True)
    # Display name of the item.
    name = me.StringField(required=True)
    # Optional free-text description.
    description = me.StringField()
    # Optional image URL.
    image_url = me.StringField()
    # Price stored as text (the scraper passes the formatted price string).
    price = me.StringField(required=True)
    # Option groups attached to this item.
    option_groups = me.ListField(me.ReferenceField(MenuOptionGroup))
    # Location value supplied by the caller when the data was saved.
    location = me.StringField(required=True)
    # Restaurant link this item belongs to.
    restro_ref_id = me.ReferenceField(DeliverooLink, required=True)
class MenuCategory(me.Document):
    """A menu category and the items listed under it."""

    # Display name of the category.
    name = me.StringField(required=True)
    # Menu items that belong to this category.
    items = me.ListField(me.ReferenceField(MenuItem))
    # Location value supplied by the caller when the data was saved.
    location = me.StringField(required=True)
    # Restaurant link this category belongs to.
    restro_ref_id = me.ReferenceField(DeliverooLink, required=True)
# FastAPI app | |
# app = FastAPI() | |
# Pydantic model for input validation | |
# class LocationRequest(BaseModel): | |
# city: str | |
# location: str | |
# @app.post("/fetch_and_process_data") | |
# async def fetch_and_process_data(request: LocationRequest): | |
# city = request.city.lower() | |
# location = request.location.lower() | |
# print(f"Fetching restaurant links for {city}, {location}...") | |
# links = fetch_restaurant_links(city, location) | |
# if not links: | |
# raise HTTPException(status_code=404, detail="No restaurants found for the specified location.") | |
# print(f"Found {len(links)} links. Processing restaurant data...") | |
# data = process_links(links, location) | |
# print("Saving data to MongoDB...") | |
# save_data_to_db(data, location) | |
# print("Data has been processed and saved successfully.") | |
# return {"message": "Data has been processed and saved successfully."} | |
def fetch_restaurant_links(city, location, max_links=20, timeout=30):
    """Collect restaurant page URLs from a Deliveroo listing page.

    Requests the listing page for ``city``/``location``, gathers the anchors
    found inside the feed card containers, makes sure a ``DeliverooLink``
    document exists for each resulting absolute URL and returns the URLs.

    Args:
        city: City slug inserted into the listing URL.
        location: Location slug inserted into the listing URL.
        max_links: Maximum number of distinct links to keep. Defaults to 20,
            the limit that used to be hard-coded; ``None`` keeps all links.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of absolute restaurant URLs. The list is empty when the request
        fails, the response status is not 200, or the page reports that no
        restaurants were found.
    """
    base_url = "https://deliveroo.ae"
    url = f"{base_url}/restaurants/{city}/{location}/?collection=restaurants"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cookie': '__cf_bm=oakl46sJ3V9vwmnIIbfXWfkHbGmmC2pH56GyTI33b4U-1715931048-1.0.1.1-4XOcSGSThZV_INfpn3aptlo8jpZtLFbYoLsZxP9BpQ8LIjq3wBIe8CPlSf0AomuniXy4TZWyVlBQBTlrm.CPiSfI1jzx18y9zxwc9GX0fmo; roo_guid=c40617a7-76f7-432c-b780-f2653cd2edfe; roo_session_guid=2e989653-2776-4ede-a52e-b610f1ad64a2'
    }

    # Without a timeout a stalled connection would block the caller forever;
    # network errors are reported the same way as any other failed fetch.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed: {exc}")
        return []

    if response.status_code != 200:
        # Previously reported as a timeout, which hid the real status code.
        print(f"Request failed with status code {response.status_code}.")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    page_text = soup.text
    if "We couldn't find" in page_text or "No restaurants" in page_text:
        print("No restaurants found for the specified location.")
        return []

    divs = soup.find_all('div', class_=["HomeFeedScrollTracker-bd9a6ffea8a4b4b7", "HomeFeedUICard-157f7be5d7b2fa7b"])
    # The same anchor can be reached through more than one matching container
    # (for example when the containers are nested); dict.fromkeys drops the
    # repeats while keeping page order so they do not use up the link limit.
    hrefs = list(dict.fromkeys(
        a_tag['href'] for div in divs for a_tag in div.find_all('a', href=True)
    ))
    print(hrefs)

    full_urls = []
    for href in hrefs[:max_links]:
        full_url = urljoin(base_url, href)
        # Look up before inserting, as save_data_to_db does, so repeated runs
        # do not accumulate duplicate link documents.
        if not DeliverooLink.objects(href=full_url).first():
            DeliverooLink(href=full_url).save()
        full_urls.append(full_url)
    return full_urls
def fetch_restaurant_data(url, location, timeout=30):
    """Download a restaurant page and extract its menu items.

    The menu is read from the JSON embedded in the page's ``__NEXT_DATA__``
    script tag, under ``props.initialState.menuPage.menu.meta``.

    Args:
        url: Restaurant page URL; also stored on every returned item under
            ``"restro_ref_id"``.
        location: Value copied onto every returned item under ``"location"``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``id``, ``category_id``,
        ``category_name``, ``name``, ``description``, ``price``, ``img_url``,
        ``modifier_groups``, ``location`` and ``restro_ref_id``. Items whose
        category id is not among the page's categories are skipped. Returns
        ``None`` when the page cannot be fetched or the embedded data is
        missing or does not have the expected layout.
    """
    headers = {
        'Cookie': '__cf_bm=_AOZtAiObnqBHPy4zhGRgBLW9xg9WiaDCRzg5E0sbMk-1715757967-1.0.1.1-xZNMBsnAqy_tfjUveujgfzT4Usw5ur4u7L0JlCcNXAQIC6Cq6wj46vPH7RLTh0Gq90JENxl7kbzjyOUFaBr8yCkmRGmt7APITEk0kkXzLTs; roo_guid=c40617a7-76f7-432c-b780-f2653cd2edfe; roo_session_guid=5846d6f0-5b7f-4598-8c6d-82b8023fd4fc'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch the URL: {url} ({exc})")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch the URL: {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    script_tag = soup.find('script', id='__NEXT_DATA__')
    # An empty tag has no .string; treat it like a missing tag.
    if not script_tag or not script_tag.string:
        print("Script tag not found")
        return None

    # One page with an unexpected layout must not abort the whole batch, so
    # parsing problems are reported and signalled with None like other failures.
    try:
        page_state = json.loads(script_tag.string)
        menu_meta = page_state['props']['initialState']['menuPage']['menu']['meta']
        items = menu_meta['items']
        categories = menu_meta['categories']
        modifier_groups = menu_meta['modifierGroups']
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        print(f"Unexpected menu data for {url}: {exc!r}")
        return None

    category_map = {category['id']: category['name'] for category in categories}
    modifier_groups_dict = {group['id']: group for group in modifier_groups}

    items_with_modifiers = []
    for item in items:
        category_id = item['categoryId']
        category_name = category_map.get(category_id, 'Unknown')
        # Skip items with the "Unknown" category
        if category_name == 'Unknown':
            continue
        image = item.get('image') or {}
        items_with_modifiers.append({
            "id": item['id'],
            "category_id": category_id,
            "category_name": category_name,
            "name": item['name'],
            "description": item.get('description', ''),
            "price": item['price']['formatted'],
            "img_url": image.get('url', ''),
            # Ids without a matching group are dropped instead of being turned
            # into empty dicts, which consumers reading group['name'] cannot
            # handle.
            "modifier_groups": [
                modifier_groups_dict[group_id]
                for group_id in item.get('modifierGroupIds') or []
                if group_id in modifier_groups_dict
            ],
            "location": location,
            "restro_ref_id": url,
        })
    return items_with_modifiers
# def save_data_to_json(data, filename='restaurants.json'): | |
# with open(filename, 'w') as f: | |
# json.dump(data, f, indent=4) | |
def process_links(urls, location):
    """Fetch the menu data of every URL and return all items as one flat list.

    URLs for which ``fetch_restaurant_data`` yields nothing (``None`` or an
    empty list) contribute no items to the result.
    """
    collected = []
    for restaurant_url in urls:
        restaurant_items = fetch_restaurant_data(restaurant_url, location)
        if not restaurant_items:
            continue
        collected += restaurant_items
    return collected
def _get_or_create_link(url, cache):
    """Return the DeliverooLink for ``url``, creating it when it does not exist.

    ``cache`` maps URLs to documents so each restaurant is queried only once
    per call of ``save_data_to_db``.
    """
    link = cache.get(url)
    if link is None:
        link = DeliverooLink.objects(href=url).first()
        if not link:
            link = DeliverooLink(href=url)
            link.save()
        cache[url] = link
    return link


def _get_or_create_option_group(group_data, location, restro_link):
    """Return the MenuOptionGroup for ``group_data`` with its options linked both ways."""
    group_name = group_data['name']
    min_quantity = group_data.get('minSelection', 0)
    max_quantity = group_data.get('maxSelection', 1)

    option_group = MenuOptionGroup.objects(
        group_name=group_name,
        min_quantity=min_quantity,
        max_quantity=max_quantity,
        location=location,
        restro_ref_id=restro_link,
    ).first()
    if not option_group:
        option_group = MenuOptionGroup(
            group_name=group_name,
            min_quantity=min_quantity,
            max_quantity=max_quantity,
            location=location,
            restro_ref_id=restro_link,
        )
        option_group.save()

    for option_data in group_data.get('modifierOptions', []):
        option_name = option_data['name']
        option_price = option_data['price']['formatted']
        option = MenuOption.objects(
            name=option_name, price=option_price, location=location, restro_ref_id=restro_link
        ).first()
        if not option:
            option = MenuOption(
                name=option_name, price=option_price, location=location, restro_ref_id=restro_link
            )
            option.save()
        # Keep both sides of the group <-> option relation in sync.
        if option not in option_group.options:
            option_group.options.append(option)
            option_group.save()
        if option_group not in option.option_groups:
            option.option_groups.append(option_group)
            option.save()
    return option_group


def _get_or_create_menu_item(item_data, option_groups, location, restro_link):
    """Return the MenuItem for ``item_data``, adding any option groups it lacks."""
    category_name = item_data['category_name']
    name = item_data['name']
    menu_item = MenuItem.objects(
        category_name=category_name, name=name, location=location, restro_ref_id=restro_link
    ).first()
    if not menu_item:
        menu_item = MenuItem(
            category_name=category_name,
            name=name,
            description=item_data.get('description', ''),
            image_url=item_data.get('img_url', ''),
            price=item_data['price'],
            option_groups=option_groups,
            location=location,
            restro_ref_id=restro_link,
        )
        menu_item.save()
    else:
        # Ensure option groups are updated for existing menu items
        for option_group in option_groups:
            if option_group not in menu_item.option_groups:
                menu_item.option_groups.append(option_group)
        menu_item.save()
    return menu_item


def save_data_to_db(data, location):
    """Persist scraped menu items, their option groups, options and categories.

    Every document is looked up first and only created when no match exists.

    Args:
        data: Iterable of item dicts with the keys ``category_name``, ``name``,
            ``price`` and ``restro_ref_id`` and optionally ``description``,
            ``img_url`` and ``modifier_groups`` (the shape produced by
            ``fetch_restaurant_data``). May be empty.
        location: Location string stored on every saved document.

    Returns:
        None.
    """
    links_by_url = {}
    # (restaurant URL, category name) -> (restaurant link, [menu items]).
    # Keying by restaurant as well as name keeps equally named categories of
    # different restaurants apart and ties each one to its own restaurant.
    items_by_category = {}

    for item_data in data:
        category_name = item_data['category_name']
        name = item_data['name']
        restro_ref_id_url = item_data['restro_ref_id']
        restro_link = _get_or_create_link(restro_ref_id_url, links_by_url)

        # Empty entries stand for modifier groups that could not be resolved
        # upstream; they carry no 'name' and are skipped.
        option_groups = [
            _get_or_create_option_group(group_data, location, restro_link)
            for group_data in item_data.get('modifier_groups', [])
            if group_data
        ]
        menu_item = _get_or_create_menu_item(item_data, option_groups, location, restro_link)

        # Store the menu item under its restaurant's category
        category_key = (restro_ref_id_url, category_name)
        items_by_category.setdefault(category_key, (restro_link, []))[1].append(menu_item)
        print(f"Saved item '{name}' in category '{category_name}' for location '{location}'")

    # Create or retrieve MenuCategory instances and associate items
    for (_, category_name), (restro_link, items) in items_by_category.items():
        menu_category = MenuCategory.objects(
            name=category_name, location=location, restro_ref_id=restro_link
        ).first()
        if not menu_category:
            menu_category = MenuCategory(
                name=category_name, location=location, restro_ref_id=restro_link
            )
            menu_category.save()
        for menu_item in items:
            if menu_item not in menu_category.items:
                menu_category.items.append(menu_item)
        menu_category.save()
# if __name__ == "__main__": | |
# uvicorn.run(app, host="0.0.0.0", port=8000) | |