Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import json
|
4 |
+
|
5 |
+
base_url = "https://www.carfind.co.za/cars-for-sale"
data = []
# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block the scrape indefinitely.
request_timeout = 30

# Iterate through listing pages 1 and 2 (the upper bound of range() is exclusive).
for page_num in range(1, 3):
    url = f"{base_url}/page{page_num}"
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One failed page should not abort the whole scrape.
        print(f"Skipping {url}: {exc}")
        continue
    soup = BeautifulSoup(response.content, "html.parser")
    car_listings = soup.find_all("div", class_="center", style="width:900px")

    for listing in car_listings:
        # Fields are only added when the matching element is found, so every
        # key in car_data is optional for downstream consumers.
        car_data = {}
        title_div = listing.find("div", style="margin-top:8px;font-weight:bold;color:#4A75BC;font-size:24px;")
        if title_div:
            car_data["title"] = title_div.text.strip()
        price_div = listing.find("div", class_="h1")
        if price_div:
            car_data["price"] = price_div.text.strip()
        installment_div = listing.find("div", style="font-weight:bolder;color:#4A75BC;white-space:nowrap;margin-top:8px;display: flex;align-items: flex-end;")
        if installment_div:
            car_data["installment"] = installment_div.text.strip()
        status_div = listing.find("div", style="margin-top:20px;font-weight:bolder;color:#4A75BC;font-size:20px")
        if status_div:
            car_data["status"] = status_div.text.strip()

        # Year, mileage and transmission are read from consecutive sibling
        # divs; each value is the last space-separated word of the icon's
        # alt text.
        year_div = listing.find("div", style="margin-top:20px;align-items:center", class_="search_row2")
        if year_div:
            year_img = year_div.find("img", alt=True)
            if year_img and "Year" in year_img["alt"]:
                car_data["year"] = year_img["alt"].split(" ")[-1]
            mileage_div = year_div.find_next_sibling("div")
            if mileage_div:
                mileage_img = mileage_div.find("img", alt=True)
                if mileage_img and "Mileage" in mileage_img["alt"]:
                    car_data["mileage"] = mileage_img["alt"].split(" ")[-1]
                transmission_div = mileage_div.find_next_sibling("div")
                if transmission_div:
                    transmission_img = transmission_div.find("img", alt=True)
                    if transmission_img and "Transmission" in transmission_img["alt"]:
                        car_data["transmission"] = transmission_img["alt"].split(" ")[-1]

        dealer_div = listing.find("div", style="margin-top:20px;color:#4A75BC;font-size:20px;")
        if dealer_div:
            car_data["dealer"] = dealer_div.text.strip()
        location_div = listing.find("div", style="display:flex;align-items:center;margin-top:20px;color:#4A75BC")
        if location_div:
            location_img = location_div.find("img", alt=True)
            if location_img and "Location" in location_img["alt"]:
                # NOTE(review): keeps only the last word of the alt text, so a
                # multi-word place name would be truncated - confirm against
                # the site's actual alt format.
                car_data["location"] = location_img["alt"].split(" ")[-1]

        # Extract the image URL
        image_div = listing.find("div", style="min-height:242px;position:relative")
        if image_div:
            img_tag = image_div.find("img", style="width:485px;max-height:365px")
            # .get() avoids a KeyError when the tag has no src attribute.
            if img_tag and img_tag.get("src"):
                car_data["image_url"] = img_tag["src"]

        link_tag = listing.find("a", href=True)
        if link_tag:
            car_link = "https://www.carfind.co.za" + link_tag["href"]
            car_data["link"] = car_link

            # Now follow the link to get more details. If the details page
            # cannot be fetched, the listing is kept with its summary fields.
            details_soup = None
            try:
                details_response = requests.get(car_link, timeout=request_timeout)
                details_response.raise_for_status()
            except requests.RequestException as exc:
                print(f"Could not fetch details for {car_link}: {exc}")
            else:
                details_soup = BeautifulSoup(details_response.content, "html.parser")

            if details_soup is not None:
                # Extract additional details from the details page
                main_info_div = details_soup.find("div", id="maininfo")
                if main_info_div:
                    description_div = main_info_div.find("div", style="margin-top:10px;font-size:14px;color:#5C5C5C;font-weight:bold")
                    if description_div:
                        car_data["description"] = description_div.text.strip()

                # Extract other specific details like Body Type, Colour, Engine Size, etc.
                # Each overview row is stored under its own header text as the key.
                overview_div = details_soup.find("div", id="overview_div")
                if overview_div:
                    details_rows = overview_div.find_all("div", class_="vdpoverviewrow")
                    for row in details_rows:
                        header = row.find("div", class_="financeheader")
                        value = row.find("div", class_="bold")
                        if header and value:
                            car_data[header.text.strip()] = value.text.strip()

        data.append(car_data)

# Save the data to a JSON file
with open('/content/car_data.json', 'w') as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii=False)
|
91 |
+
|
92 |
+
import json
|
93 |
+
|
94 |
+
# Define the mapping of old keys to new keys
|
95 |
+
# Overview-row headers (keys containing spaces) mapped to the space-free key
# names that the later stages look up.
key_mapping = {
    "Body Type": "BodyType",
    "Driving Wheels": "DrivingWheels",
    "Engine Size": "EngineSize",
    "Fuel Type": "FuelType",
    "Gearbox Type": "GearboxType",
}
|
102 |
+
|
103 |
+
# Function to rename keys recursively in a JSON object
|
104 |
+
def rename_keys(obj, mapping):
    """Return a copy of *obj* with dictionary keys renamed via *mapping*.

    Dicts and lists are walked recursively and rebuilt; any other value is
    returned unchanged. Keys absent from *mapping* keep their original name.

    Args:
        obj: A JSON-like value (dict, list, or scalar).
        mapping: Dict of old key name -> new key name.

    Returns:
        The renamed structure; the input dicts and lists are not modified.
    """
    if isinstance(obj, dict):
        return {
            mapping.get(key, key): rename_keys(value, mapping)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [rename_keys(item, mapping) for item in obj]
    return obj
|
115 |
+
|
116 |
+
# Path to the input JSON file
|
117 |
+
input_json_path = '/content/car_data.json'

# Path to save the modified JSON file
output_json_path = '/content/car_dataformatted.json'

# Read the JSON file
with open(input_json_path, 'r') as file:
    data = json.load(file)

# Rename keys
modified_data = rename_keys(data, key_mapping)

# Save the modified JSON to a new file. json.dump's default ensure_ascii=True
# applies here, so non-ASCII characters are written as \uXXXX escapes.
with open(output_json_path, 'w') as file:
    json.dump(modified_data, file, indent=4)

print(f"Modified JSON saved to {output_json_path}")
|
134 |
+
import json
|
135 |
+
import concurrent.futures
|
136 |
+
from gradio_client import Client
|
137 |
+
import httpx
|
138 |
+
|
139 |
+
# Function to fetch AI response with timeout handling
|
140 |
+
def fetch_ai_response(client, title):
    """Ask the chat endpoint for metric-system details about a car.

    Args:
        client: Object exposing ``predict(message=..., api_name=...)``; the
            script passes a ``gradio_client.Client``.
        title: Car title inserted into the prompt.

    Returns:
        Whatever ``client.predict`` returns on success. On failure no
        exception propagates: the string ``"Timeout occurred"`` is returned
        for ``httpx.TimeoutException`` and ``"Error: <message>"`` for any
        other exception.
    """
    try:
        return client.predict(
            message=f"Provide me with details of a {title} in the metric system",
            api_name="/chat",
        )
    except httpx.TimeoutException:
        return "Timeout occurred"
    except Exception as e:
        # Deliberately broad: one failed request must not stop the batch, so
        # the failure is reported in-band as the result text.
        return f"Error: {e}"
|
151 |
+
|
152 |
+
# Initialize Gradio Client
|
153 |
+
client = Client("IAMTFRMZA/Groq-llama-3-chatbot_70b")

# Load existing JSON data
with open("/content/car_dataformatted.json", "r") as json_file:
    car_listings = json.load(json_file)

# Use concurrent processing to fetch AI responses
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Remember which listing each future belongs to: as_completed() yields
    # futures in completion order, not submission order, so a running counter
    # would attach answers to the wrong cars.
    future_to_index = {}
    for idx, listing in enumerate(car_listings):
        title = listing.get("title")
        if not title:
            # The scraper only stores a title when it finds one; there is
            # nothing to ask about for such listings.
            continue
        future = executor.submit(fetch_ai_response, client, title)
        future_to_index[future] = idx

    # Retrieve results from futures
    for future in concurrent.futures.as_completed(future_to_index):
        idx = future_to_index[future]
        try:
            car_listings[idx]["AskAI"] = future.result()
        except Exception as exc:
            print(f"Exception occurred: {exc}")

# Save updated listings back to JSON file
with open("/content/car_dataai.json", "w") as json_file:
    json.dump(car_listings, json_file, indent=4, ensure_ascii=False)

print("JSON file updated successfully with AI responses.")
|
179 |
+
import gradio as gr
|
180 |
+
import json
|
181 |
+
from sentence_transformers import SentenceTransformer, util
|
182 |
+
|
183 |
+
# Load the car data
|
184 |
+
with open('/content/car_dataai.json', 'r') as f:
    # car_data is iterated as a list of per-car dicts by the code below.
    car_data = json.load(f)
|
186 |
+
|
187 |
+
# Function to normalize key names
|
188 |
+
def normalize_key(key):
    """Return *key* stripped of surrounding whitespace, lower-cased, with
    each remaining space replaced by an underscore."""
    return key.strip().lower().replace(' ', '_')
|
190 |
+
|
191 |
+
# Prepare car listings with additional fields
|
192 |
+
# Fields concatenated, in this order, into the text that is embedded per car.
# NOTE(review): 'colour', 'power' and 'seats' are not written by any stage
# visible in this file (overview rows are stored under their raw header text);
# confirm the actual key names.
description_fields = (
    'title', 'price', 'status', 'link', 'year', 'location', 'image_url',
    'mileage', 'BodyType', 'colour', 'DrivingWheels', 'EngineSize',
    'FuelType', 'GearboxType', 'power', 'seats',
)

car_listings = []
for car in car_data:
    # Skip absent fields so the literal word "None" is not embedded as if it
    # were part of the listing.
    present_values = [
        str(car[field]) for field in description_fields
        if car.get(field) not in (None, '')
    ]
    car_listings.append(" - ".join(present_values).strip())

# Load a pre-trained Sentence Transformer model
model_name = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Vectorize the car listings; row i of car_embeddings corresponds to
# car_data[i] because both are built in the same order.
car_embeddings = model.encode(car_listings, convert_to_tensor=True)
|
203 |
+
|
204 |
+
# Listing fields checked for a literal (case- and space-insensitive) match.
_BOOST_FIELDS = ('title', 'price', 'status', 'year', 'location')
# Added to the cosine score when the query literally appears in a boost field.
_LITERAL_MATCH_BOOST = 0.8
# Boosted scores at or above _HIGH_SCORE go in the first section; scores in
# [_LOW_SCORE, _HIGH_SCORE) go in the second; anything lower is not shown.
_HIGH_SCORE = 0.5
_LOW_SCORE = 0.1
_CARD_STYLE = "border: 1px solid #ddd; border-radius: 5px; padding: 10px; margin: 10px; width: 300px;"
_LOW_LINK_STYLE = "text-decoration:none; color:#333; background-color:#f0f0f0; padding:8px 12px; border-radius:4px; margin-right: 10px;"


def _render_card(score, car_info, link_style=""):
    """Return the HTML for one result card, escaping every scraped value."""
    from html import escape  # local import keeps this block self-contained

    def text(*keys):
        # First non-empty value among *keys*, HTML-escaped; "N/A" if none.
        for key in keys:
            value = car_info.get(key)
            if value not in (None, ""):
                return escape(str(value))
        return "N/A"

    parts = [f"<div style='{_CARD_STYLE}'>"]
    image_url = car_info.get('image_url')
    if image_url:
        parts.append(
            f"<img src='{escape(str(image_url), quote=True)}' width='100' "
            "style='float:left; margin-right: 10px;'>"
        )
    parts.append(f"<h3>{text('title')}</h3>")
    parts.append(f"<p><b>Price:</b> {text('price')}</p>")
    parts.append(f"<p><b>Status:</b> {text('status')}</p>")
    parts.append(f"<p><b>Body Type:</b> {text('BodyType')}</p>")
    parts.append(f"<p><b>Year:</b> {text('year')}</p>")
    # The scraper stores this under lower-case 'mileage'; 'Mileage' is kept
    # as a fallback in case a details-page row supplies it under that name.
    parts.append(f"<p><b>Mileage:</b> {text('mileage', 'Mileage')}</p>")
    parts.append(f"<p><b>Location:</b> {text('location')}</p>")
    link = car_info.get('link')
    if link:
        style_attr = f" style='{link_style}'" if link_style else ""
        parts.append(
            f"<a href='{escape(str(link), quote=True)}' target='_blank'{style_attr}>View Listing</a>"
        )
    parts.append(f"<p><b>Similarity Score:</b> {score:.4f}</p>")
    parts.append("</div>")
    return "".join(parts)


def _render_section(heading, scored_cars, link_style=""):
    """Return a heading plus a flex container of cards, or "" if no cars."""
    if not scored_cars:
        return ""
    cards = "".join(_render_card(score, car, link_style) for score, car in scored_cars)
    return f"<h2>{heading}</h2><div style='display:flex; flex-wrap:wrap;'>{cards}</div>"


def search_cars(query):
    """Rank the loaded car listings against *query* and render them as HTML.

    Reads the module-level ``model``, ``car_embeddings`` and ``car_data``.
    Each car's score is the cosine similarity between the query embedding and
    the car's embedding, plus a fixed boost when the query (spaces removed,
    lower-cased) occurs inside one of the car's boost fields.

    Args:
        query: Free-text search string from the UI.

    Returns:
        An HTML string with up to two sections of result cards, sorted by
        descending score. Returns "" for a blank query, or when no car
        reaches the lower score threshold.
    """
    # A blank query would be a substring of every field and boost every car.
    if not query or not query.strip():
        return ""

    # Vectorize the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Calculate cosine similarity between the query and each car listing
    cos_scores = util.pytorch_cos_sim(query_embedding, car_embeddings)[0]

    # Normalize query to lower case and remove spaces for comparison
    query_normalized = query.replace(" ", "").lower()

    scored = []
    for score, car in zip(cos_scores, car_data):
        field_texts = (
            str(car.get(field) or '').replace(" ", "").lower()
            for field in _BOOST_FIELDS
        )
        matched = any(query_normalized in field_text for field_text in field_texts)
        boost = _LITERAL_MATCH_BOOST if matched else 0
        scored.append((score.item() + boost, car))

    # Stable sort: cars with equal scores keep their original relative order.
    scored.sort(key=lambda item: item[0], reverse=True)

    high_score_results = [item for item in scored if item[0] >= _HIGH_SCORE]
    low_score_results = [item for item in scored if _LOW_SCORE <= item[0] < _HIGH_SCORE]

    return (
        _render_section("Here are your results", high_score_results)
        + _render_section(
            "Some more results you might be interested in",
            low_score_results,
            _LOW_LINK_STYLE,
        )
    )
|
277 |
+
|
278 |
+
# Create a chat interface using Gradio's Blocks
|
279 |
+
def create_chat_interface():
    """Build the Gradio search UI and return it without launching.

    The layout is three rows: a Markdown heading, a query textbox, and an
    HTML output area. ``search_cars`` is wired to the textbox's ``change``
    event, with its return value rendered into the HTML component.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    with gr.Blocks(theme=gr.themes.Monochrome(), fill_height=True) as demo:
        with gr.Row():
            gr.Markdown("## Ai Search")
        with gr.Row():
            query_input = gr.Textbox(
                label="What are you looking for?",
                placeholder="Please type your query here",
            )
        with gr.Row():
            output_html = gr.HTML(label="Search Results")

        # Re-run the search each time the textbox content changes.
        query_input.change(fn=search_cars, inputs=query_input, outputs=output_html)

    return demo
|
291 |
+
|
292 |
+
# Launch the chat interface
|
293 |
+
# Build the UI, then start the Gradio server.
demo = create_chat_interface()
demo.launch()
|
295 |
+
|