kambris committed on
Commit
1c5ddd8
·
verified ·
1 Parent(s): 8c80330

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -8
app.py CHANGED
@@ -15,16 +15,12 @@ emotion_classifier = pipeline("text-classification", model=emotion_model, tokeni
15
 
16
  # Function to generate embeddings using AraBERT
17
  def generate_embeddings(texts):
18
- # Ensure texts is a list of strings
19
- if isinstance(texts, str): # If single string, convert to list
20
- texts = [texts]
21
-
22
- # Tokenize the list of strings (ensure all are strings)
23
  inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=False, max_length=512)
24
 
25
- # Split large sequences into chunks of size 512
26
  chunked_inputs = []
27
  for input_ids in inputs['input_ids']:
 
28
  chunks = [input_ids[i:i + 512] for i in range(0, len(input_ids), 512)]
29
  chunked_inputs.extend(chunks)
30
 
@@ -37,8 +33,9 @@ def generate_embeddings(texts):
37
  chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
38
  embeddings.append(chunk_embedding)
39
 
40
- # Return the embeddings averaged across chunks
41
- return embeddings
 
42
 
43
  # Function to process the uploaded file and summarize by country
44
  def process_and_summarize(uploaded_file, top_n=50):
 
15
 
16
  # Function to generate embeddings using AraBERT
17
  def generate_embeddings(texts):
18
+ # Tokenize all the texts (poems)
 
 
 
 
19
  inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=False, max_length=512)
20
 
 
21
  chunked_inputs = []
22
  for input_ids in inputs['input_ids']:
23
+ # Split each long sequence into chunks of max 512 tokens
24
  chunks = [input_ids[i:i + 512] for i in range(0, len(input_ids), 512)]
25
  chunked_inputs.extend(chunks)
26
 
 
33
  chunk_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
34
  embeddings.append(chunk_embedding)
35
 
36
+ # Combine all embeddings (you can take the average of embeddings for each poem)
37
+ final_embeddings = sum(embeddings) / len(embeddings)
38
+ return final_embeddings
39
 
40
  # Function to process the uploaded file and summarize by country
41
  def process_and_summarize(uploaded_file, top_n=50):