Yuvalamitay committed on
Commit 546fcd4 · verified · 1 Parent(s): 5b751cb

Update app.py

Files changed (1)
  1. app.py +63 -102
app.py CHANGED
@@ -1,131 +1,92 @@
- from huggingface_hub import InferenceClient
- #step 1 from semantic search
  from sentence_transformers import SentenceTransformer
  import torch
-
  import gradio as gr
- import random

  client = InferenceClient("Qwen/Qwen2.5-72B-Instruct")
- #step 2 from semantic search read file

- # Open the water_cycle.txt file in read mode with UTF-8 encoding
  with open("reconext_file.txt", "r", encoding="utf-8") as file:
-     # Read the entire contents of the file and store it in a variable
-     reconext_file_text = file.read()
-
- # Print the text below
- print(reconext_file_text)
-
- #step 3 from semantix search

  def preprocess_text(text):
-     # Strip extra whitespace from the beginning and the end of the text
-     cleaned_text = text.strip()
-
-     # Split the cleaned_text by every newline character (\n)
-     chunks = cleaned_text.split("\n")
-
-     # Create an empty list to store cleaned chunks
-     cleaned_chunks = []
-
-     # Write your for-in loop below to clean each chunk and add it to the cleaned_chunks list
-     for chunk in chunks:
-         clean_chunk = chunk.strip()
-         if(len(clean_chunk) >= 0):
-             cleaned_chunks.append(clean_chunk)
-
-     # Print cleaned_chunks
-     print(cleaned_chunks)
-
-     # Print the length of cleaned_chunks
-     print(len(cleaned_chunks))

-     # Return the cleaned_chunks
-     return cleaned_chunks

- # Call the preprocess_text function and store the result in a cleaned_chunks variable
- cleaned_chunks = preprocess_text(reconext_file_text) # Complete this line
-
- #step 4 from semantic search
-
- # Load the pre-trained embedding model that converts text to vectors
  model = SentenceTransformer('all-MiniLM-L6-v2')

  def create_embeddings(text_chunks):
-     # Convert each text chunk into a vector embedding and store as a tensor
-     chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True) # Replace ... with the text_chunks list
-
-     # Print the chunk embeddings
-     print(chunk_embeddings)
-
-     # Print the shape of chunk_embeddings
-     print(chunk_embeddings.shape)

-     # Return the chunk_embeddings
-     return chunk_embeddings

- # Call the create_embeddings function and store the result in a new chunk_embeddings variable
- chunk_embeddings = create_embeddings(cleaned_chunks) # Complete this line
-
- #step 5 from semantic search
-
- # Define a function to find the most relevant text chunks for a given query, chunk_embeddings, and text_chunks
  def get_top_chunks(query, chunk_embeddings, text_chunks):
-     # Convert the query text into a vector embedding
-     query_embedding = model.encode(query, convert_to_tensor=True) # Complete this line
-
-     # Normalize the query embedding to unit length for accurate similarity comparison
-     query_embedding_normalized = query_embedding / query_embedding.norm()
-
-     # Normalize all chunk embeddings to unit length for consistent comparison
-     chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
-
-     # Calculate cosine similarity between query and all chunks using matrix multiplication
-     similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized) # Complete this line
-
-     # Print the similarities
-     print(similarities)
-
-     # Find the indices of the 3 chunks with highest similarity scores
-     top_indices = torch.topk(similarities, k=3).indices
-
-     # Print the top indices
-     print(top_indices)
-
-     # Create an empty list to store the most relevant chunks
-     top_chunks = []
-
-     # Loop through the top indices and retrieve the corresponding text chunks
-     for i in top_indices:
-         top_chunks.append(text_chunks[i])
-
-     # Return the list of most relevant chunks
-     return top_chunks
-

- def respond(message, history):
-     best_next_watch = get_top_chunks(message, chunk_embeddings, cleaned_chunks)
-     print(best_next_watch)
-     str_watch_chunks = "\n".join(best_next_watch)
      messages = [
-         {"role":"system",
-          "content": "You are a gen-z helpful chatbot that helps teenagers find their next best watch, speak in gen-z terms and be natural. You should answer the users question based on " + str_watch_chunks + " ."
          }
      ]

      if history:
          messages.extend(history)

-     messages.append(
-         {'role':'user',
-          'content':message}
-     )
-
      response = client.chat_completion(
-         messages, max_tokens = 300, temperature=1.3, top_p=0.6
      )
      return response['choices'][0]['message']['content'].strip()

- chatbot = gr.ChatInterface(respond, type="messages")
- chatbot.launch()
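
Note: in the removed preprocess_text above, the check if(len(clean_chunk) >= 0) is always true (a length is never negative), so blank lines survived as empty chunks and were embedded alongside real ones. The added version below filters them out. A minimal sketch of the difference, on a made-up three-line string:

    text = "line one\n\n  line two  "
    old_chunks = [chunk.strip() for chunk in text.strip().split("\n")]
    new_chunks = [chunk.strip() for chunk in text.strip().split("\n") if chunk.strip()]
    print(old_chunks)  # ['line one', '', 'line two']  (empty chunk kept)
    print(new_chunks)  # ['line one', 'line two']
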
+ from huggingface_hub import InferenceClient
  from sentence_transformers import SentenceTransformer
  import torch
  import gradio as gr
+ import requests
+ import os
+ TMDB_TOKEN = os.getenv("TMDB_BEARER_TOKEN")

+ # Hugging Face model
  client = InferenceClient("Qwen/Qwen2.5-72B-Instruct")

+ # Load and clean reconext text
  with open("reconext_file.txt", "r", encoding="utf-8") as file:
+     reconext_file_text = file.read()

  def preprocess_text(text):
+     chunks = [chunk.strip() for chunk in text.strip().split("\n") if chunk.strip()]
+     return chunks

+ cleaned_chunks = preprocess_text(reconext_file_text)

+ # Convert text chunks to embeddings
  model = SentenceTransformer('all-MiniLM-L6-v2')

  def create_embeddings(text_chunks):
+     return model.encode(text_chunks, convert_to_tensor=True)

+ chunk_embeddings = create_embeddings(cleaned_chunks)

+ # Semantic search for top relevant chunks
  def get_top_chunks(query, chunk_embeddings, text_chunks):
+     query_embedding = model.encode(query, convert_to_tensor=True)
+     query_embedding_normalized = query_embedding / query_embedding.norm()
+     chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
+     similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized)
+     top_indices = torch.topk(similarities, k=3).indices
+     return [text_chunks[i] for i in top_indices]
+
+ # TMDB API function
+ def get_tmdb_recommendation(query):
+     url = "https://api.themoviedb.org/3/search/multi"
+     headers = {
+         "Authorization": f"Bearer {TMDB_TOKEN}"
+     }
+     params = {
+         "query": query,
+         "include_adult": False,
+         "language": "en-US",
+         "page": 1
+     }
+
+     response = requests.get(url, headers=headers, params=params)
+     if response.status_code == 200:
+         results = response.json().get("results", [])
+         if not results:
+             return "Nothin' popped up on TMDB for that 🫠"
+
+         top = results[0]
+         title = top.get("title") or top.get("name") or "a mystery show"
+         overview = top.get("overview", "No description available.")
+         return f"🔥 Try watching **{title}** — {overview}"
+     else:
+         return "TMDB ghosted us 👻 Try again later."
+
+ # Chatbot response function
+ def respond(message, history):
+     if any(word in message.lower() for word in ["recommend", "suggest", "watch", "movie", "show"]):
+         return get_tmdb_recommendation(message)
+
+     best_chunks = get_top_chunks(message, chunk_embeddings, cleaned_chunks)
+     str_chunks = "\n".join(best_chunks)

      messages = [
+         {
+             "role": "system",
+             "content": f"You are a gen-z helpful chatbot that helps teenagers find their next best watch. Speak in a chill, funny, and relatable tone. Use the info below to answer:\n{str_chunks}"
          }
      ]
+
      if history:
          messages.extend(history)

+     messages.append({'role': 'user', 'content': message})
+
      response = client.chat_completion(
+         messages, max_tokens=300, temperature=1.3, top_p=0.6
      )
      return response['choices'][0]['message']['content'].strip()

+ # Gradio app
+ chatbot = gr.ChatInterface(respond, type="messages", title="📺 Gen-Z Watch Buddy")
+ chatbot.launch()
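
Note on get_top_chunks: normalizing both sides and taking a matmul is exactly cosine similarity, and torch.topk(similarities, k=3) will raise an error if reconext_file.txt yields fewer than three chunks. A minimal sanity-check sketch, assuming the same model; the chunk strings are made-up placeholders:

    from sentence_transformers import SentenceTransformer, util
    import torch

    model = SentenceTransformer('all-MiniLM-L6-v2')
    chunks = ["a cozy sci-fi show", "a gritty crime drama"]  # placeholder chunks
    chunk_embeddings = model.encode(chunks, convert_to_tensor=True)
    query_embedding = model.encode("something futuristic", convert_to_tensor=True)

    scores = util.cos_sim(query_embedding, chunk_embeddings)[0]  # same ranking as normalize + matmul
    top = torch.topk(scores, k=min(3, len(chunks)))              # guard k for short files
    print([chunks[i] for i in top.indices])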
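
Note on get_tmdb_recommendation: it expects TMDB_BEARER_TOKEN in the environment (on a Hugging Face Space, typically added as a secret in the Space settings), and the request carries no timeout, so a slow TMDB response can stall the app. A slightly hardened variant, sketched under those assumptions; the name get_tmdb_recommendation_safe is illustrative, not part of the commit:

    import os
    import requests

    def get_tmdb_recommendation_safe(query):
        # Same TMDB multi-search endpoint as the commit, plus a timeout and
        # exception handling so network failures return a friendly message.
        token = os.getenv("TMDB_BEARER_TOKEN")
        if not token:
            return "No TMDB token set 😬 add TMDB_BEARER_TOKEN first."
        try:
            response = requests.get(
                "https://api.themoviedb.org/3/search/multi",
                headers={"Authorization": f"Bearer {token}"},
                params={"query": query, "include_adult": False, "language": "en-US", "page": 1},
                timeout=10,
            )
            response.raise_for_status()
        except requests.RequestException:
            return "TMDB ghosted us 👻 Try again later."
        results = response.json().get("results", [])
        if not results:
            return "Nothin' popped up on TMDB for that 🫠"
        top = results[0]
        title = top.get("title") or top.get("name") or "a mystery show"
        overview = top.get("overview", "No description available.")
        return f"🔥 Try watching **{title}**: {overview}"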
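
Note on the keyword gate in respond: substring matching means "watch" also fires inside words like "rewatched" or "watchlist", routing those messages to TMDB instead of the chatbot. If whole-word matching is ever wanted, a word-boundary regex is one option; a sketch using the same keyword list as the commit:

    import re

    WATCH_WORDS = re.compile(r"\b(recommend|suggest|watch|movie|show)\b", re.IGNORECASE)

    def wants_recommendation(message):
        # True only when a whole keyword appears, not a fragment of another word.
        return bool(WATCH_WORDS.search(message))

    print(wants_recommendation("any movie ideas?"))  # True
    print(wants_recommendation("I rewatched it"))    # False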