Estonian-Judge

Running

App Files Files Community

E-slam committed on Sep 20, 2024

Commit

1a2217b

verified ·

1 Parent(s): 5d98430

Delete Allam_Backend_HF.py

Browse files

Files changed (1) hide show

Allam_Backend_HF.py +0 -267

Allam_Backend_HF.py DELETED Viewed

@@ -1,267 +0,0 @@
-import pandas as pd
-import faiss
-import numpy as np
-import torch
-import requests
-import os
-#import huggingface_hub
-# Hugging Face access token, read from the environment; embed_single_text
-# sends it as a Bearer token to the embedding service.
-hf_token = os.getenv("hf_token")
-#huggingface_hub.login(hf_token)
-# Article table. The code below reads its 'Article_text', 'File_ID' and
-# 'Row_Number' columns.
-df = pd.read_excel("Allam_SA_Articles.xlsx")
-input_texts = df['Article_text'].tolist()
-# Precomputed embedding matrix loaded from disk; presumably one row per row of
-# df, in the same order — TODO confirm against the script that produced it.
-MOJ_embeddings = np.load('Allam_embeddings.npy')
-def embed_single_text(query):
-    """Fetch the embedding of ``query`` from the hosted E5 embedding service.
-
-    Args:
-        query: Text to embed.
-
-    Returns:
-        A ``torch.Tensor`` built from the service's JSON payload, or ``None``
-        when the request fails or the service answers with a non-200 status
-        (the error is printed in both cases).
-    """
-    headers = {
-        "Authorization": f"Bearer {hf_token}"
-    }
-    url = "https://allam-llm-e5-embeddings.hf.space/e5_embeddings"
-    try:
-        # Send the query through ``params`` so that requests percent-encodes
-        # it; interpolating it into the URL breaks on '&', '#' and '+'.
-        response = requests.get(url, headers=headers, params={"query": query}, timeout=30)
-    except requests.RequestException as exc:
-        # Same contract as a non-200 answer: report and return None.
-        print(f"Error: {exc}")
-        return None
-    if response.status_code == 200:
-        return torch.tensor(response.json())
-    print(f"Error: {response.status_code}")
-    return None
-#Faiss
-# Flat (exhaustive) inner-product index over the precomputed embeddings.
-# NOTE(review): inner product only ranks like cosine similarity if the stored
-# vectors are L2-normalised — confirm how Allam_embeddings.npy was produced.
-dimension = MOJ_embeddings.shape[1]
-index = faiss.IndexFlatIP(dimension)
-index.add(MOJ_embeddings)
-def query_search(query, K):
-    """Return the row positions in ``df`` of the ``K`` nearest articles.
-
-    Args:
-        query: Text to search for.
-        K: Number of neighbours to request from the FAISS index.
-
-    Returns:
-        A list of integer row positions, best match first. The list is empty
-        when the query could not be embedded, and may be shorter than ``K``
-        when the index holds fewer than ``K`` vectors.
-    """
-    query_embedding = embed_single_text(query)
-    if query_embedding is None:
-        # The embedding service failed; there is nothing to search with.
-        return []
-    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim),
-    # not a torch tensor and not a bare 1-D vector.
-    query_matrix = np.ascontiguousarray(np.asarray(query_embedding, dtype=np.float32))
-    if query_matrix.ndim == 1:
-        query_matrix = query_matrix.reshape(1, -1)
-    _, indices = index.search(query_matrix, K)
-    # FAISS pads the result with -1 when fewer than K neighbours exist.
-    return [int(idx) for idx in indices[0] if idx >= 0]
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-def return_top5_chunks(query):
-    """Build the context string holding the five chunks closest to ``query``.
-
-    The 15 nearest articles are fetched with ``query_search``, split into
-    150-word chunks, and the chunks are ranked against the query by TF-IDF
-    cosine similarity.
-
-    Args:
-        query: The user's question.
-
-    Returns:
-        A string with one "Index: <row label>," line and one "Chunk: <text>"
-        line per selected chunk, entries separated by a "##########" line.
-        Empty string when no chunk is available.
-    """
-    matching_indices = query_search(query, 15)
-    # Copy the slice so that adding a column does not write into a view of
-    # ``df`` (avoids pandas' SettingWithCopyWarning).
-    relevant_rows = df.iloc[matching_indices].copy()
-    def chunk_text(text, max_words=150):
-        words = text.split()
-        return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
-    relevant_rows['Chunks'] = relevant_rows['Article_text'].apply(chunk_text)
-    # Pair every chunk with the label of the row it came from.
-    chunked_texts = [
-        (chunk, idx)
-        for idx, row in relevant_rows.iterrows()
-        for chunk in row['Chunks']
-    ]
-    if not chunked_texts:
-        # TfidfVectorizer / cosine_similarity raise on an empty corpus.
-        return ''
-    def find_top_k_similar(texts, query, k):
-        documents = [text for text, _ in texts]
-        vectorizer = TfidfVectorizer()
-        # The query is appended last so that it shares the vocabulary.
-        tfidf_matrix = vectorizer.fit_transform(documents + [query])
-        similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
-        top_k_indices = similarities.argsort()[-k:][::-1]
-        return [(texts[i], similarities[i]) for i in top_k_indices]
-    top_5_chunks = find_top_k_similar(chunked_texts, query, 5)
-    return "##########\n".join(
-        f"Index: {idx},\nChunk: {chunk}\n" for (chunk, idx), _ in top_5_chunks
-    )
-import requests
-# IBM Cloud API key. It is read from the environment (like hf_token) so that
-# the secret is not stored in the source; set IBM_API_KEY before starting.
-api_key = os.getenv("IBM_API_KEY")
-if not api_key:
-    raise RuntimeError("IBM_API_KEY environment variable is not set")
-url = "https://iam.cloud.ibm.com/identity/token"
-headers = {
-    "Content-Type": "application/x-www-form-urlencoded"
-}
-data = {
-    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
-    "apikey": api_key
-}
-# Exchange the API key for an IAM bearer token used by the generation calls.
-# NOTE(review): the token is fetched once at import time; IAM tokens expire,
-# so a long-running process will need to refresh it — confirm the lifetime.
-response = requests.post(url, headers=headers, data=data, timeout=30)
-# Fail with the HTTP error instead of a KeyError on 'access_token' below.
-response.raise_for_status()
-token_info = response.json()
-access_token = token_info['access_token']
-def allam_response(context, query):
-    """Ask the ALLaM instruct model to answer ``query`` from ``context``.
-
-    Args:
-        context: Text the model is told to answer from.
-        query: The user's question.
-
-    Returns:
-        The ``generated_text`` of the first result in the service's reply.
-
-    Raises:
-        Exception: if the service answers with a non-200 status.
-    """
-    prompt = f"""
-    [Context]: {context}
-    [System]:
-    You are an Arabic frindley chatbot named مستنير.
-    You will be provided with an Arabic context ,
-    Your task is to extract and Answer for the questions only from the context provided
-    elaborate on the answer from the context
-    At the end of your response mention the Article : مادة
-    if no answer is found apologize
-    Question: {query}
-    """
-    generation_params = {
-        "decoding_method": "greedy",
-        "max_new_tokens": 900,
-        "min_new_tokens": 0,
-        "stop_sequences": [],
-        "repetition_penalty": 1
-    }
-    payload = {
-        "input": prompt,
-        "parameters": generation_params,
-        "model_id": "sdaia/allam-1-13b-instruct",
-        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
-    }
-    request_headers = {
-        "Accept": "application/json",
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {access_token}"
-    }
-    reply = requests.post(
-        "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29",
-        headers=request_headers,
-        json=payload,
-    )
-    if reply.status_code == 200:
-        return reply.json()['results'][0]['generated_text']
-    raise Exception("Non-200 response: " + str(reply.text))
-import json
-import re
-def index_num(text):
-    """Extract the integer index from the model's ``{"Index": ...}`` reply.
-
-    Both a quoted (``"Index": "12"``) and an unquoted (``"Index": 12``)
-    number are accepted.
-
-    Args:
-        text: Raw text generated by the model.
-
-    Returns:
-        The index as an ``int``.
-
-    Raises:
-        ValueError: if ``text`` holds no numeric index, for example the
-            ``{"Index": "not_found"}`` reply the prompt allows.
-    """
-    match = re.search(r'"Index":\s*"?(\d+)"?', text)
-    if match is None:
-        # Previously this fell through to int(None) and an opaque TypeError.
-        raise ValueError(f"no numeric Index found in model reply: {text!r}")
-    return int(match.group(1))
-def get_top_matching_chunk(text, query, max_words=500):
-    """Return the ``max_words``-word chunk of ``text`` most similar to ``query``.
-
-    Similarity is TF-IDF cosine similarity between each chunk and the query.
-
-    Args:
-        text: Article text to split into chunks.
-        query: The user's question.
-        max_words: Maximum number of whitespace-separated words per chunk.
-
-    Returns:
-        The best-matching chunk, or an empty string when ``text`` has no words.
-    """
-    words = text.split()
-    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
-    if not chunks:
-        # An empty corpus makes cosine_similarity / argmax raise.
-        return ''
-    if len(chunks) == 1:
-        # Nothing to rank; skip the vectoriser entirely.
-        return chunks[0]
-    vectorizer = TfidfVectorizer()
-    # The query is appended last so that it shares the chunks' vocabulary.
-    tfidf_matrix = vectorizer.fit_transform(chunks + [query])
-    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
-    return chunks[similarities.argmax()]
-def reformat_indentation(text, indent_spaces=4):
-    """Strip each line of ``text`` and prefix it with ``indent_spaces`` spaces.
-
-    Lines are split with ``str.splitlines`` and re-joined with ``"\\n"``.
-    """
-    prefix = ' ' * indent_spaces
-    return '\n'.join(prefix + raw_line.strip() for raw_line in text.splitlines())
-def return_index_num(data_text, query):
-    """Ask the ALLaM model which chunk of ``data_text`` answers ``query``.
-
-    Args:
-        data_text: Context text made of chunks separated by "##########".
-        query: The user's question.
-
-    Returns:
-        The raw ``generated_text`` of the first result; the prompt asks the
-        model for a JSON object with an "Index" key.
-
-    Raises:
-        Exception: if the service answers with a non-200 status.
-    """
-    instructions = """
-    Identify the **first** Index chunk with the answer to a given question.
-    Chunks are seperated by ##########
-    Respond only with **Json** format **do not return any words**:
-    {"Index": "extracted_Index"}
-    Or:
-    {"Index": "not_found"}
-    **No additional text allowed**.
-    """
-    instructions = instructions + f"Question : {query}"
-    raw_prompt = f"""
-    [Context]: {data_text.strip()}
-    [System]: {instructions.strip()}
-    """
-    # Remove the source-code indentation from every prompt line.
-    prompt = reformat_indentation(raw_prompt, indent_spaces=0)
-    payload = {
-        "input": prompt,
-        "parameters": {
-            "decoding_method": "greedy",
-            "max_new_tokens": 20,
-            "repetition_penalty": 1
-        },
-        "model_id": "sdaia/allam-1-13b-instruct",
-        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
-    }
-    request_headers = {
-        "Accept": "application/json",
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {access_token}"  # access_token must be defined elsewhere
-    }
-    reply = requests.post(
-        "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29",
-        headers=request_headers,
-        json=payload,
-    )
-    if reply.status_code == 200:
-        return reply.json()['results'][0]['generated_text']
-    raise Exception("Non-200 response: " + str(reply.text))
-def allam_llm(q):
-    """Answer question ``q`` with retrieval followed by ALLaM generation.
-
-    Steps: retrieve the five most similar chunks, ask the model which chunk
-    holds the answer, take the best-matching part of that article and ask the
-    model to answer from it.
-
-    Args:
-        q: The user's question.
-
-    Returns:
-        The text generated by ``allam_response``.
-    """
-    chunks_text = return_top5_chunks(q)
-    targeted_chunk = return_index_num(chunks_text, q)
-    try:
-        index_number = index_num(targeted_chunk)
-        text_to_chunk = df['Article_text'][index_number]
-    except (TypeError, ValueError, KeyError):
-        # The model replied "not_found", returned unparsable text, or named a
-        # row that does not exist. Fall back to the retrieved chunks as
-        # context so the answer prompt can still answer or apologise.
-        return allam_response(chunks_text, q)
-    top_chunk = get_top_matching_chunk(text_to_chunk, q)
-    return allam_response(top_chunk, q)