ngocminhta committed
Commit 3fef185 · 1 Parent(s): 5287779
Files changed (12)
  1. app.py +80 -0
  2. gen_database.py +300 -0
  3. infer.py +130 -0
  4. requirements.txt +11 -0
  5. src/.DS_Store +0 -0
  6. src/__init__.py +0 -0
  7. src/index.py +80 -0
  8. src/simclr.py +280 -0
  9. src/text_embedding.py +55 -0
  10. utils/__init__.py +0 -0
  11. utils/load_dataset.py +205 -0
  12. utils/utils.py +132 -0
app.py ADDED
@@ -0,0 +1,80 @@
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ import torch
+ from src.text_embedding import TextEmbeddingModel
+ from src.index import Indexer
+ import os
+ import pickle
+ from infer import infer_3_class
+ import uvicorn
+
+ app = FastAPI()
+
+ origins = ["*"]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ class Opt:
+     def __init__(self):
+         self.model_name = "./unsup-simcse-xlm-roberta-base"
+         self.model_path = "core/model.pth"
+         self.database_path = "core/seen_db"
+         self.embedding_dim = 768
+         self.device_num = 1
+
+ opt = Opt()
+
+ def load_pkl(path):
+     with open(path, 'rb') as f:
+         return pickle.load(f)
+
+ @app.on_event("startup")
+ def load_model_resources():
+     global model, tokenizer, index, label_dict, is_mixed_dict
+
+     model = TextEmbeddingModel(opt.model_name)
+     state_dict = torch.load(opt.model_path, map_location=model.model.device)
+     new_state_dict = {}
+     for key in state_dict.keys():
+         if key.startswith('model.'):
+             new_state_dict[key[6:]] = state_dict[key]
+     model.load_state_dict(state_dict)
+     tokenizer = model.tokenizer
+
+     index = Indexer(opt.embedding_dim)
+     index.deserialize_from(opt.database_path)
+     label_dict = load_pkl(os.path.join(opt.database_path, 'label_dict.pkl'))
+     is_mixed_dict = load_pkl(os.path.join(opt.database_path, 'is_mixed_dict.pkl'))
+
+
+ @app.post('/predict')
+ async def predict(request: Request):
+     data = await request.json()
+     mode = data.get("mode", "normal").lower()
+     text_list = data.get("text", [])
+
+     if mode == "normal":
+         results = []
+         for text in text_list:
+             result = infer_3_class(model=model,
+                                    tokenizer=tokenizer,
+                                    index=index,
+                                    label_dict=label_dict,
+                                    is_mixed_dict=is_mixed_dict,
+                                    text=text,
+                                    K=20)
+             results.append(result)
+         return JSONResponse(content={"results": results})
+     elif mode == "advanced":
+         return 0
+
+ if __name__ == "__main__":
+     port = int(os.getenv("PORT", 8000))
+     uvicorn.run(app, host="0.0.0.0", port=port)
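Note: a minimal client sketch for the `/predict` endpoint above. The host, port, payload values, and the use of `requests` are illustrative assumptions, not part of the commit.

```python
# Hypothetical client call against the /predict endpoint defined in app.py.
# Assumes the server is running locally on the default port (8000).
import requests

payload = {
    "mode": "normal",
    "text": ["An example passage to classify.", "Another passage."],
}
resp = requests.post("http://localhost:8000/predict", json=payload)
print(resp.json())  # e.g. {"results": [0, 1]} with 0 = Human, 1 = AI, 2 = Mixed
```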
gen_database.py ADDED
@@ -0,0 +1,300 @@
+ import os
+ import pickle
+ import random
+ import faiss
+ from src.index import Indexer
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from torch.utils.data import DataLoader
+ from lightning import Fabric
+ from tqdm import tqdm
+ import argparse
+ from src.text_embedding import TextEmbeddingModel
+ from utils.load_dataset import load_dataset, TextDataset, load_outdomain_dataset
+
+ def load_pkl(path):
+     with open(path, 'rb') as f:
+         return pickle.load(f)
+
+ def infer(passages_dataloder, fabric, tokenizer, model, ood=False):
+     if fabric.global_rank == 0:
+         passages_dataloder = tqdm(passages_dataloder, total=len(passages_dataloder))
+     if ood:
+         allids, allembeddings, alllabels, all_is_mixed = [], [], [], []
+     else:
+         allids, allembeddings, alllabels, all_is_mixed, all_write_model = [], [], [], [], []
+     model.model.eval()
+     with torch.no_grad():
+         for batch in passages_dataloder:
+             if ood:
+                 ids, text, label, is_mixed = batch
+                 encoded_batch = tokenizer.batch_encode_plus(
+                     text,
+                     return_tensors="pt",
+                     max_length=512,
+                     padding="max_length",
+                     # padding=True,
+                     truncation=True,
+                 )
+                 encoded_batch = {k: v.cuda() for k, v in encoded_batch.items()}
+                 # output = model(**encoded_batch).last_hidden_state
+                 # embeddings = pooling(output, encoded_batch)
+                 # print(encoded_batch)
+                 embeddings = model(encoded_batch)
+                 # print(encoded_batch['input_ids'].shape)
+                 embeddings = fabric.all_gather(embeddings).view(-1, embeddings.size(1))
+                 label = fabric.all_gather(label).view(-1)
+                 ids = fabric.all_gather(ids).view(-1)
+                 is_mixed = fabric.all_gather(is_mixed).view(-1)
+                 if fabric.global_rank == 0:
+                     allembeddings.append(embeddings.cpu())
+                     allids.extend(ids.cpu().tolist())
+                     alllabels.extend(label.cpu().tolist())
+                     all_is_mixed.extend(is_mixed.cpu().tolist())
+             else:
+                 ids, text, label, is_mixed, write_model = batch
+                 encoded_batch = tokenizer.batch_encode_plus(
+                     text,
+                     return_tensors="pt",
+                     max_length=512,
+                     padding="max_length",
+                     # padding=True,
+                     truncation=True,
+                 )
+                 encoded_batch = {k: v.cuda() for k, v in encoded_batch.items()}
+                 # output = model(**encoded_batch).last_hidden_state
+                 # embeddings = pooling(output, encoded_batch)
+                 # print(encoded_batch)
+                 embeddings = model(encoded_batch)
+                 # print(encoded_batch['input_ids'].shape)
+                 embeddings = fabric.all_gather(embeddings).view(-1, embeddings.size(1))
+                 label = fabric.all_gather(label).view(-1)
+                 ids = fabric.all_gather(ids).view(-1)
+                 is_mixed = fabric.all_gather(is_mixed).view(-1)
+                 write_model = fabric.all_gather(write_model).view(-1)
+                 if fabric.global_rank == 0:
+                     allembeddings.append(embeddings.cpu())
+                     allids.extend(ids.cpu().tolist())
+                     alllabels.extend(label.cpu().tolist())
+                     all_is_mixed.extend(is_mixed.cpu().tolist())
+                     all_write_model.extend(write_model.cpu().tolist())
+     if fabric.global_rank == 0:
+         allembeddings = torch.cat(allembeddings, dim=0)
+         epsilon = 1e-6
+         if ood:
+             emb_dict, label_dict, is_mixed_dict = {}, {}, {}
+             allembeddings = F.normalize(allembeddings, dim=-1)
+             for i in range(len(allids)):
+                 emb_dict[allids[i]] = allembeddings[i]
+                 label_dict[allids[i]] = alllabels[i]
+                 is_mixed_dict[allids[i]] = all_is_mixed[i]
+             allids, allembeddings, alllabels, all_is_mixed = [], [], [], []
+             for key in emb_dict:
+                 allids.append(key)
+                 allembeddings.append(emb_dict[key])
+                 alllabels.append(label_dict[key])
+                 all_is_mixed.append(is_mixed_dict[key])
+             allembeddings = torch.stack(allembeddings, dim=0)
+             return allids, allembeddings.numpy(), alllabels, all_is_mixed
+         else:
+             emb_dict, label_dict, is_mixed_dict, write_model_dict = {}, {}, {}, {}
+             allembeddings = F.normalize(allembeddings, dim=-1)
+             for i in range(len(allids)):
+                 emb_dict[allids[i]] = allembeddings[i]
+                 label_dict[allids[i]] = alllabels[i]
+                 is_mixed_dict[allids[i]] = all_is_mixed[i]
+                 write_model_dict[allids[i]] = all_write_model[i]
+             allids, allembeddings, alllabels, all_is_mixed, all_write_model = [], [], [], [], []
+             for key in emb_dict:
+                 allids.append(key)
+                 allembeddings.append(emb_dict[key])
+                 alllabels.append(label_dict[key])
+                 all_is_mixed.append(is_mixed_dict[key])
+                 all_write_model.append(write_model_dict[key])
+             allembeddings = torch.stack(allembeddings, dim=0)
+             return allids, allembeddings.numpy(), alllabels, all_is_mixed, all_write_model
+     else:
+         if ood:
+             return [], [], [], []
+         return [], [], [], [], []
+
+ def set_seed(seed):
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
+     np.random.seed(seed)  # Numpy module.
+     random.seed(seed)  # Python random module.
+
+ def test(opt):
+     if opt.device_num > 1:
+         fabric = Fabric(accelerator="cuda", devices=opt.device_num, strategy='ddp')
+     else:
+         fabric = Fabric(accelerator="cuda", devices=opt.device_num)
+     fabric.launch()
+     model = TextEmbeddingModel(opt.model_name).cuda()
+     state_dict = torch.load(opt.model_path, map_location=model.model.device)
+     new_state_dict = {}
+     for key in state_dict.keys():
+         if key.startswith('model.'):
+             new_state_dict[key[6:]] = state_dict[key]
+     model.load_state_dict(state_dict)
+     tokenizer = model.tokenizer
+     database = load_dataset(opt.dataset_name, opt.database_path)[opt.database_name]
+     passage_dataset = TextDataset(database, need_ids=True)
+     print(len(passage_dataset))
+
+     passages_dataloder = DataLoader(passage_dataset, batch_size=opt.batch_size, num_workers=opt.num_workers, pin_memory=True)
+     passages_dataloder = fabric.setup_dataloaders(passages_dataloder)
+     model = fabric.setup(model)
+
+     train_ids, train_embeddings, train_labels, train_is_mixed, train_write_model = infer(passages_dataloder, fabric, tokenizer, model)
+     fabric.barrier()
+
+     if fabric.global_rank == 0:
+         index = Indexer(opt.embedding_dim)
+         index.index_data(train_ids, train_embeddings)
+         label_dict = {}
+         is_mixed_dict = {}
+         write_model_dict = {}
+         for i in range(len(train_ids)):
+             label_dict[train_ids[i]] = train_labels[i]
+             is_mixed_dict[train_ids[i]] = train_is_mixed[i]
+             write_model_dict[train_ids[i]] = train_write_model[i]
+
+         if not os.path.exists(opt.save_path):
+             os.makedirs(opt.save_path)
+         index.serialize(opt.save_path)
+         # save label_dict using pickle
+         with open(os.path.join(opt.save_path, 'label_dict.pkl'), 'wb') as f:
+             pickle.dump(label_dict, f)
+         # save is_mixed_dict using pickle
+         with open(os.path.join(opt.save_path, 'is_mixed_dict.pkl'), 'wb') as f:
+             pickle.dump(is_mixed_dict, f)
+         # save write_model_dict using pickle
+         with open(os.path.join(opt.save_path, 'write_model_dict.pkl'), 'wb') as f:
+             pickle.dump(write_model_dict, f)
+
+ def add_to_existed_index(opt):
+     if opt.device_num > 1:
+         fabric = Fabric(accelerator="cuda", devices=opt.device_num, strategy='ddp')
+     else:
+         fabric = Fabric(accelerator="cuda", devices=opt.device_num)
+     fabric.launch()
+     model = TextEmbeddingModel(opt.model_name).cuda()
+     state_dict = torch.load(opt.model_path, map_location=model.model.device)
+     new_state_dict = {}
+     for key in state_dict.keys():
+         if key.startswith('model.'):
+             new_state_dict[key[6:]] = state_dict[key]
+     model.load_state_dict(state_dict)
+     tokenizer = model.tokenizer
+
+     if opt.ood:
+         database = load_outdomain_dataset(opt.database_path)[opt.database_name]
+     else:
+         database = load_dataset(opt.dataset_name, opt.database_path)[opt.database_name]
+
+     passage_dataset = TextDataset(database, need_ids=True, out_domain=opt.ood)
+     print(len(passage_dataset))
+
+     passages_dataloder = DataLoader(passage_dataset, batch_size=opt.batch_size, num_workers=opt.num_workers, pin_memory=True)
+     passages_dataloder = fabric.setup_dataloaders(passages_dataloder)
+     model = fabric.setup(model)
+
+     if opt.ood:
+         train_ids, train_embeddings, train_labels, train_is_mixed = infer(passages_dataloder, fabric, tokenizer, model, ood=True)
+     else:
+         train_ids, train_embeddings, train_labels, train_is_mixed, train_write_model = infer(passages_dataloder, fabric, tokenizer, model)
+     fabric.barrier()
+
+     if fabric.global_rank == 0:
+         new_index = Indexer(opt.embedding_dim)
+         new_index.index_data(train_ids, train_embeddings)
+
+         old_index = Indexer(opt.embedding_dim)
+         old_index.deserialize_from(opt.existed_index_path)
+         old_ids = old_index.index_id_to_db_id
+
+         # Ensure both indexes are of type IndexFlatIP
+         # assert isinstance(new_index.index, faiss.IndexFlatIP)
+         # assert isinstance(old_index.index, faiss.IndexFlatIP)
+
+         # Ensure both indexes have the same dimensionality
+         assert new_index.index.d == old_index.index.d
+
+         # Extract vectors from old_index.index
+         vectors = old_index.index.reconstruct_n(0, old_index.index.ntotal)
+
+         # Add vectors to new_index.index
+         new_index.index_data(old_ids, vectors)
+
+         if not os.path.exists(opt.new_save_path):
+             os.makedirs(opt.new_save_path)
+         new_index.serialize(opt.new_save_path)
+
+         if opt.ood:
+             label_dict = load_pkl(os.path.join(opt.existed_index_path, 'label_dict.pkl'))
+             is_mixed_dict = load_pkl(os.path.join(opt.existed_index_path, 'is_mixed_dict.pkl'))
+             for i in range(len(train_ids)):
+                 label_dict[train_ids[i]] = train_labels[i]
+                 is_mixed_dict[train_ids[i]] = train_is_mixed[i]
+             # save label_dict using pickle
+             with open(os.path.join(opt.new_save_path, 'label_dict.pkl'), 'wb') as f:
+                 pickle.dump(label_dict, f)
+             # save is_mixed_dict using pickle
+             with open(os.path.join(opt.new_save_path, 'is_mixed_dict.pkl'), 'wb') as f:
+                 pickle.dump(is_mixed_dict, f)
+
+         else:
+             label_dict = load_pkl(os.path.join(opt.existed_index_path, 'label_dict.pkl'))
+             is_mixed_dict = load_pkl(os.path.join(opt.existed_index_path, 'is_mixed_dict.pkl'))
+             write_model_dict = load_pkl(os.path.join(opt.existed_index_path, 'write_model_dict.pkl'))
+             for i in range(len(train_ids)):
+                 label_dict[train_ids[i]] = train_labels[i]
+                 is_mixed_dict[train_ids[i]] = train_is_mixed[i]
+                 write_model_dict[train_ids[i]] = train_write_model[i]
+             # save label_dict using pickle
+             with open(os.path.join(opt.new_save_path, 'label_dict.pkl'), 'wb') as f:
+                 pickle.dump(label_dict, f)
+             # save is_mixed_dict using pickle
+             with open(os.path.join(opt.new_save_path, 'is_mixed_dict.pkl'), 'wb') as f:
+                 pickle.dump(is_mixed_dict, f)
+             # save write_model_dict using pickle
+             with open(os.path.join(opt.new_save_path, 'write_model_dict.pkl'), 'wb') as f:
+                 pickle.dump(write_model_dict, f)
+
+
+
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--device_num', type=int, default=1)
+     parser.add_argument('--batch_size', type=int, default=128)
+     parser.add_argument('--num_workers', type=int, default=8)
+     parser.add_argument('--embedding_dim', type=int, default=768)
+
+     # parser.add_argument('--mode', type=str, default='deepfake', help="deepfake,MGT or MGTDetect_CoCo")
+     parser.add_argument("--database_path", type=str, default="data/FALCONSet", help="Path to the data")
+     parser.add_argument('--dataset_name', type=str, default='falconset', help="falconset, llmdetectaive, hart")
+     parser.add_argument('--database_name', type=str, default='train', help="train,valid,test,test_ood")
+     parser.add_argument("--model_path", type=str, default="runs/authscan_v6/model_best.pth",
+                         help="Path to the embedding model checkpoint")
+     parser.add_argument('--model_name', type=str, default="FacebookAI/xlm-roberta-base", help="Model name")
+     parser.add_argument("--save_path", type=str, default="/output", help="Path to save the database")
+     parser.add_argument("--add_to_existed_index", type=int, default=0)
+     # parser.add_argument("--add_to_existed_index_path", type=str, default="/output", help="Path to save the database")
+     parser.add_argument("--ood", type=int, default=0)
+     parser.add_argument("--existed_index_path", type=str, default="/output", help="Path of existed index")
+     parser.add_argument("--new_save_path", type=str, default="/new_db", help="Path to save the database")
+
+     parser.add_argument('--seed', type=int, default=0)
+     opt = parser.parse_args()
+     set_seed(opt.seed)
+
+     if not opt.add_to_existed_index:
+         test(opt)
+     else:
+         add_to_existed_index(opt)
+
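Note: `test()` writes a FAISS index plus three pickled lookup dictionaries into `--save_path`. A minimal sketch of loading those artifacts back for querying (the `db_dir` path is a placeholder, not a path from this commit):

```python
# Sketch: reload the artifacts written by gen_database.py.
import os
import pickle

from src.index import Indexer

db_dir = "output_db"               # placeholder for whatever --save_path was used
index = Indexer(768)               # must match --embedding_dim
index.deserialize_from(db_dir)     # reads index.faiss and index_meta.faiss

with open(os.path.join(db_dir, "label_dict.pkl"), "rb") as f:
    label_dict = pickle.load(f)    # id -> label component of the index tuple
with open(os.path.join(db_dir, "is_mixed_dict.pkl"), "rb") as f:
    is_mixed_dict = pickle.load(f) # id -> is_mixed component of the index tuple
```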
infer.py ADDED
@@ -0,0 +1,130 @@
+ import os
+ import pickle
+ import numpy as np
+ from src.index import Indexer
+ import torch
+ import argparse
+ from src.text_embedding import TextEmbeddingModel
+ import random
+ from collections import Counter
+
+
+ def softmax_weights(scores, temperature=1.0):
+     scores = np.array(scores)
+     scores = scores / temperature
+     e_scores = np.exp(scores - np.max(scores))
+     return e_scores / np.sum(e_scores)
+
+ def normalize_fuzzy_cnt(fuzzy_cnt):
+     total = sum(fuzzy_cnt.values())
+     if total == 0:
+         return fuzzy_cnt
+     for key in fuzzy_cnt:
+         fuzzy_cnt[key] /= total
+     return fuzzy_cnt
+
+ def class_type_boost(query_type, candidate_type):
+     if query_type == candidate_type:
+         return 1.3
+     elif abs(query_type - candidate_type) == 1:
+         return 1.1
+     elif abs(query_type - candidate_type) == 2:
+         return 0.9
+     else:
+         return 0.8
+
+ def set_seed(seed):
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
+     np.random.seed(seed)  # Numpy module.
+     random.seed(seed)  # Python random module.
+
+ def load_pkl(path):
+     with open(path, 'rb') as f:
+         return pickle.load(f)
+
+ def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text, K):
+     # model = TextEmbeddingModel(opt.model_name).cuda()
+     # state_dict = torch.load(opt.model_path, map_location=model.model.device)
+     # new_state_dict={}
+     # for key in state_dict.keys():
+     #     if key.startswith('model.'):
+     #         new_state_dict[key[6:]]=state_dict[key]
+     # model.load_state_dict(state_dict)
+     # tokenizer=model.tokenizer
+
+     # index = Indexer(opt.embedding_dim)
+     # index.deserialize_from(opt.database_path)
+     # label_dict=load_pkl(os.path.join(opt.database_path,'label_dict.pkl'))
+     # is_mixed_dict=load_pkl(os.path.join(opt.database_path,'is_mixed_dict.pkl'))
+
+     # text = opt.text
+     encoded_text = tokenizer.batch_encode_plus(
+         [text],
+         return_tensors="pt",
+         max_length=512,
+         padding="max_length",
+         truncation=True,
+     )
+     encoded_text = {k: v for k, v in encoded_text.items()}
+     embeddings = model(encoded_text).cpu().detach().numpy()
+     top_ids_and_scores = index.search_knn(embeddings, K)
+     pred = []
+     for i, (ids, scores) in enumerate(top_ids_and_scores):
+         print(f"Top {K} results for text:")
+         sorted_scores = np.argsort(scores)
+         sorted_scores = sorted_scores[::-1]
+
+         topk_ids = [ids[j] for j in sorted_scores]
+         topk_scores = [scores[j] for j in sorted_scores]
+         weights = softmax_weights(topk_scores, temperature=0.1)
+
+         candidate_models = [is_mixed_dict[int(_id)] for _id in topk_ids]
+         initial_pred = Counter(candidate_models).most_common(1)[0][0]
+
+         fuzzy_cnt = {(1, 0): 0.0, (0, 10^3): 0.0, (1, 1): 0.0}
+         for id, weight in zip(topk_ids, weights):
+             label = (label_dict[int(id)], is_mixed_dict[int(id)])
+             boost = class_type_boost(is_mixed_dict[int(id)], initial_pred)
+             fuzzy_cnt[label] += weight * boost
+
+         final = max(fuzzy_cnt, key=fuzzy_cnt.get)
+
+         # print(f"Top {opt.K} results for text:")
+         # cnt = {(1,0):0,(0,10^3):0,(1,1):0}
+         # for j, (id, score) in enumerate(zip(ids, scores)):
+         #     print(f"{j+1}. ID {id} Label {label_dict[int(id)]} Is_mixed {is_mixed_dict[int(id)]} Score {score}")
+         #     cnt[(label_dict[int(id)], is_mixed_dict[int(id)])]+=1
+         # final = max(cnt, key=cnt.get)
+         # pred.append(final)
+         if final == (1, 0):
+             print("Human")
+             return 0
+         elif final == (0, 10^3):
+             print("AI")
+             return 1
+         else:
+             print("Mixed")
+             return 2
+         # pred.append(final)
+     return -1
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--embedding_dim', type=int, default=768)
+     parser.add_argument('--database_path', type=str, default="database", help="Path to the index file")
+
+     parser.add_argument("--model_path", type=str, default="core/model.pth",
+                         help="Path to the embedding model checkpoint")
+     parser.add_argument('--model_name', type=str, default="ZurichNLP/unsup-simcse-xlm-roberta-base", help="Model name")
+
+     parser.add_argument('--K', type=int, default=20, help="Search [1,K] nearest neighbors, choose the best K")
+     parser.add_argument('--pooling', type=str, default="average", help="Pooling method, average or cls")
+     parser.add_argument('--text', type=str, default="")
+     parser.add_argument('--seed', type=int, default=0)
+
+     opt = parser.parse_args()
+     set_seed(opt.seed)
+     infer(opt)
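Note: the dictionary key written as `(0, 10^3)` uses Python's XOR operator, so it evaluates to `(0, 9)`; the code is internally consistent because the same expression is used when the database is built. To make the weighted vote in `infer_3_class` concrete, here is a small self-contained sketch of how `softmax_weights` and `class_type_boost` combine neighbour labels; all scores and labels below are invented for illustration.

```python
# Toy illustration of the weighted neighbour vote used in infer_3_class (values invented).
import numpy as np
from collections import Counter

def softmax_weights(scores, temperature=1.0):
    scores = np.array(scores) / temperature
    e = np.exp(scores - np.max(scores))
    return e / e.sum()

def class_type_boost(query_type, candidate_type):
    diff = abs(query_type - candidate_type)
    return {0: 1.3, 1: 1.1, 2: 0.9}.get(diff, 0.8)

scores   = [0.92, 0.90, 0.75]        # retrieval similarities (fake)
is_mixed = [0, 9, 0]                 # neighbour is_mixed codes (fake; 9 == 10^3)
labels   = [(1, 0), (0, 9), (1, 0)]  # neighbour (label, is_mixed) pairs (fake)

weights = softmax_weights(scores, temperature=0.1)
initial = Counter(is_mixed).most_common(1)[0][0]   # majority is_mixed code

fuzzy = {(1, 0): 0.0, (0, 9): 0.0, (1, 1): 0.0}
for lab, mix, w in zip(labels, is_mixed, weights):
    fuzzy[lab] += w * class_type_boost(mix, initial)

print(max(fuzzy, key=fuzzy.get))  # -> (1, 0), i.e. "Human" for these fake values
```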
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ pandas~=2.0.3
+ tqdm~=4.66.4
+ torch~=2.3.0
+ transformers~=4.41.1
+ scikit-learn~=1.3.2
+ datasets~=2.19.1
+ nltk~=3.8.1
+ tiktoken~=0.7.0
+ faiss-cpu
+ uvicorn
+ fastapi
src/.DS_Store ADDED
Binary file (6.15 kB).
 
src/__init__.py ADDED
File without changes
src/index.py ADDED
@@ -0,0 +1,80 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import os
+ import pickle
+ from typing import List, Tuple
+
+ import faiss
+ import numpy as np
+ from tqdm import tqdm
+
+ class Indexer(object):
+
+     def __init__(self, vector_sz, device='cpu'):
+         self.index = faiss.IndexFlatIP(vector_sz)
+         self.device = device
+         if self.device == 'cuda':
+             self.index = faiss.index_cpu_to_all_gpus(self.index)
+         self.index_id_to_db_id = []
+
+     def index_data(self, ids, embeddings):
+         self._update_id_mapping(ids)
+         embeddings = embeddings.astype('float32')
+         if not self.index.is_trained:
+             self.index.train(embeddings)
+         self.index.add(embeddings)
+
+         print(f'Total data indexed {self.index.ntotal}')
+
+     def search_knn(self, query_vectors: np.array, top_docs: int, index_batch_size: int = 8) -> List[Tuple[List[object], List[float]]]:
+         query_vectors = query_vectors.astype('float32')
+         result = []
+         nbatch = (len(query_vectors) - 1) // index_batch_size + 1
+         for k in tqdm(range(nbatch)):
+             start_idx = k * index_batch_size
+             end_idx = min((k + 1) * index_batch_size, len(query_vectors))
+             q = query_vectors[start_idx: end_idx]
+             scores, indexes = self.index.search(q, top_docs)
+             # convert to external ids
+             db_ids = [[str(self.index_id_to_db_id[i]) for i in query_top_idxs] for query_top_idxs in indexes]
+             result.extend([(db_ids[i], scores[i]) for i in range(len(db_ids))])
+         return result
+
+     def serialize(self, dir_path):
+         index_file = os.path.join(dir_path, 'index.faiss')
+         meta_file = os.path.join(dir_path, 'index_meta.faiss')
+         print(f'Serializing index to {index_file}, meta data to {meta_file}')
+         if self.device == 'cuda':
+             save_index = faiss.index_gpu_to_cpu(self.index)
+         else:
+             save_index = self.index
+         faiss.write_index(save_index, index_file)
+         with open(meta_file, mode='wb') as f:
+             pickle.dump(self.index_id_to_db_id, f)
+
+     def deserialize_from(self, dir_path):
+         index_file = os.path.join(dir_path, 'index.faiss')
+         meta_file = os.path.join(dir_path, 'index_meta.faiss')
+         print(f'Loading index from {index_file}, meta data from {meta_file}')
+
+         self.index = faiss.read_index(index_file)
+         if self.device == 'cuda':
+             self.index = faiss.index_cpu_to_all_gpus(self.index)
+         print(f'Loaded index of type {type(self.index)} and size {self.index.ntotal}')
+
+         with open(meta_file, "rb") as reader:
+             self.index_id_to_db_id = pickle.load(reader)
+         assert len(
+             self.index_id_to_db_id) == self.index.ntotal, 'Deserialized index_id_to_db_id should match faiss index size'
+
+     def _update_id_mapping(self, db_ids: List):
+         self.index_id_to_db_id.extend(db_ids)
+
+     def reset(self):
+         self.index.reset()
+         self.index_id_to_db_id = []
+         print(f'Index reset, total data indexed {self.index.ntotal}')
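Note: a minimal usage sketch for `Indexer`, on random data rather than real embeddings. Because the underlying index is `IndexFlatIP` (inner product), L2-normalizing the vectors makes the scores cosine similarities, which matches how `gen_database.py` normalizes embeddings before indexing.

```python
# Minimal Indexer usage sketch (purely illustrative, random vectors).
import numpy as np
from src.index import Indexer

dim = 768
idx = Indexer(dim)

ids = list(range(100))
vecs = np.random.rand(100, dim).astype("float32")
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # normalize so IP == cosine
idx.index_data(ids, vecs)

query = vecs[:2]  # pretend these are query embeddings
for doc_ids, scores in idx.search_knn(query, top_docs=5):
    print(doc_ids, scores)
```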
src/simclr.py ADDED
@@ -0,0 +1,280 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from src.text_embedding import TextEmbeddingModel
+
+ class ClassificationHead(nn.Module):
+     """Head for sentence-level classification tasks."""
+
+     def __init__(self, in_dim, out_dim):
+         super(ClassificationHead, self).__init__()
+         self.dense1 = nn.Linear(in_dim, in_dim // 4)
+         self.dense2 = nn.Linear(in_dim // 4, in_dim // 16)
+         self.out_proj = nn.Linear(in_dim // 16, out_dim)
+
+         nn.init.xavier_uniform_(self.dense1.weight)
+         nn.init.xavier_uniform_(self.dense2.weight)
+         nn.init.xavier_uniform_(self.out_proj.weight)
+         nn.init.normal_(self.dense1.bias, std=1e-6)
+         nn.init.normal_(self.dense2.bias, std=1e-6)
+         nn.init.normal_(self.out_proj.bias, std=1e-6)
+
+     def forward(self, features):
+         x = features
+         x = self.dense1(x)
+         x = torch.tanh(x)
+         x = self.dense2(x)
+         x = torch.tanh(x)
+         x = self.out_proj(x)
+         return x
+
+ class SimCLR_Classifier_SCL(nn.Module):
+     def __init__(self, opt, fabric):
+         super(SimCLR_Classifier_SCL, self).__init__()
+
+         self.temperature = opt.temperature
+         self.opt = opt
+         self.fabric = fabric
+         self.model = TextEmbeddingModel(opt.model_name)
+         self.device = self.model.model.device
+         if opt.resum:
+             state_dict = torch.load(opt.pth_path, map_location=self.device)
+             self.model.load_state_dict(state_dict)
+         self.esp = torch.tensor(1e-6, device=self.device)
+         self.classifier = ClassificationHead(opt.projection_size, opt.classifier_dim)
+
+         self.a = torch.tensor(opt.a, device=self.device)
+         self.d = torch.tensor(opt.d, device=self.device)
+         self.only_classifier = opt.only_classifier
+
+
+     def get_encoder(self):
+         return self.model
+
+     def _compute_logits(self, q, q_index1, q_index2, q_label, k, k_index1, k_index2, k_label):
+         def cosine_similarity_matrix(q, k):
+
+             q_norm = F.normalize(q, dim=-1)
+             k_norm = F.normalize(k, dim=-1)
+             cosine_similarity = q_norm @ k_norm.T
+
+             return cosine_similarity
+
+         logits = cosine_similarity_matrix(q, k) / self.temperature
+
+         q_labels = q_label.view(-1, 1)  # N,1
+         k_labels = k_label.view(1, -1)  # 1,N+K
+
+         same_label = (q_labels == k_labels)  # N,N+K
+
+         # model:model set
+         pos_logits_model = torch.sum(logits * same_label, dim=1) / torch.max(torch.sum(same_label, dim=1), self.esp)
+         neg_logits_model = logits * torch.logical_not(same_label)
+         logits_model = torch.cat((pos_logits_model.unsqueeze(1), neg_logits_model), dim=1)
+
+         return logits_model
+
+     def forward(self, batch, indices1, indices2, label):
+         bsz = batch['input_ids'].size(0)
+         q = self.model(batch)
+         k = q.clone().detach()
+         k = self.fabric.all_gather(k).view(-1, k.size(1))
+         k_label = self.fabric.all_gather(label).view(-1)
+         k_index1 = self.fabric.all_gather(indices1).view(-1)
+         k_index2 = self.fabric.all_gather(indices2).view(-1)
+         # q:N
+         # k:4N
+         logits_label = self._compute_logits(q, indices1, indices2, label, k, k_index1, k_index2, k_label)
+
+         out = self.classifier(q)
+
+         if self.opt.AA:
+             loss_classfiy = F.cross_entropy(out, indices1)
+         else:
+             loss_classfiy = F.cross_entropy(out, label)
+
+         gt = torch.zeros(bsz, dtype=torch.long, device=logits_label.device)
+
+         if self.only_classifier:
+             loss_label = torch.tensor(0, device=self.device)
+         else:
+             loss_label = F.cross_entropy(logits_label, gt)
+
+         loss = self.a * loss_label + self.d * loss_classfiy
+         if self.training:
+             return loss, loss_label, loss_classfiy, k, k_label
+         else:
+             out = self.fabric.all_gather(out).view(-1, out.size(1))
+             return loss, out, k, k_label
+
+
+ class SimCLR_Classifier_test(nn.Module):
+     def __init__(self, opt, fabric):
+         super(SimCLR_Classifier_test, self).__init__()
+
+         self.fabric = fabric
+         self.model = TextEmbeddingModel(opt.model_name)
+         self.classifier = ClassificationHead(opt.projection_size, opt.classifier_dim)
+         self.device = self.model.model.device
+
+     def forward(self, batch):
+         q = self.model(batch)
+         out = self.classifier(q)
+         return out
+
+ class SimCLR_Classifier(nn.Module):
+     def __init__(self, opt, fabric):
+         super(SimCLR_Classifier, self).__init__()
+
+         self.temperature = opt.temperature
+         self.opt = opt
+         self.fabric = fabric
+
+         self.model = TextEmbeddingModel(opt.model_name)
+         if opt.resum:
+             state_dict = torch.load(opt.pth_path,
+                                     map_location=self.model.model.device)
+             self.model.load_state_dict(state_dict)
+
+         self.device = self.model.model.device
+         self.esp = torch.tensor(1e-6, device=self.device)
+         self.a = torch.tensor(opt.a,
+                               device=self.device)
+         self.b = torch.tensor(opt.b,
+                               device=self.device)
+         self.c = torch.tensor(opt.c,
+                               device=self.device)
+
+         self.classifier = ClassificationHead(opt.projection_size,
+                                              opt.classifier_dim)
+         self.only_classifier = opt.only_classifier
+
+
+     def get_encoder(self):
+         return self.model
+
+     def _compute_logits(self,
+                         q, q_index1, q_index2, q_label,
+                         k, k_index1, k_index2, k_label):
+         def cosine_similarity_matrix(q, k):
+
+             q_norm = F.normalize(q, dim=-1)
+             k_norm = F.normalize(k, dim=-1)
+             cosine_similarity = q_norm @ k_norm.T
+             return cosine_similarity
+
+         logits = cosine_similarity_matrix(q, k) / self.temperature
+
+         q_index1 = q_index1.view(-1, 1)  # change to tensor of size N, 1
+         q_index2 = q_index2.view(-1, 1)  # change to tensor of size N, 1
+         q_labels = q_label.view(-1, 1)   # change to tensor of size N, 1
+
+         k_index1 = k_index1.view(1, -1)  # 1,N+K
+         k_index2 = k_index2.view(1, -1)  # 1,N+K
+         k_labels = k_label.view(1, -1)   # 1,N+K
+
+         same_mixed = (q_index1 == k_index1)
+         same_set = (q_index2 == k_index2)    # N,N+K
+         same_label = (q_labels == k_labels)  # N,N+K
+
+         is_human = (q_label == 1).view(-1)
+         is_machine = (q_label == 0).view(-1)
+
+         is_mixed = (q_index1 == 1).view(-1)
+
+         # human: human
+         pos_logits_human = torch.sum(logits * same_label, dim=1) / torch.max(torch.sum(same_label, dim=1), self.esp)
+         neg_logits_human = logits * torch.logical_not(same_label)
+         logits_human = torch.cat((pos_logits_human.unsqueeze(1), neg_logits_human), dim=1)
+         logits_human = logits_human[is_human]
+
+         # human+ai: general
+         pos_logits_mixed = torch.sum(logits * same_mixed, dim=1) / torch.maximum(torch.sum(same_mixed, dim=1), self.esp)
+         neg_logits_mixed = logits * torch.logical_not(same_mixed)
+         logits_mixed = torch.cat((pos_logits_mixed.unsqueeze(1), neg_logits_mixed), dim=1)
+         logits_mixed = logits_mixed[is_mixed]
+
+         # human+ai: model
+         pos_logits_mixed_set = torch.sum(logits * torch.logical_and(same_mixed, same_set), dim=1) / torch.max(torch.sum(torch.logical_and(same_mixed, same_set), dim=1), self.esp)
+         neg_logits_mixed_set = logits * torch.logical_not(torch.logical_and(same_mixed, same_set))
+         logits_mixed_set = torch.cat((pos_logits_mixed_set.unsqueeze(1), neg_logits_mixed_set), dim=1)
+         logits_mixed_set = logits_mixed_set[is_mixed]
+
+         # model set:label
+         pos_logits_set = torch.sum(logits * same_set, dim=1) / torch.max(torch.sum(same_set, dim=1), self.esp)
+         neg_logits_set = logits * torch.logical_not(same_set)
+         logits_set = torch.cat((pos_logits_set.unsqueeze(1), neg_logits_set), dim=1)
+         logits_set = logits_set[is_machine]
+
+         # label: label
+         pos_logits_label = torch.sum(logits * same_label, dim=1) / torch.max(torch.sum(same_label, dim=1), self.esp)
+         neg_logits_label = logits * torch.logical_not(same_label)
+         logits_label = torch.cat((pos_logits_label.unsqueeze(1), neg_logits_label), dim=1)
+         logits_label = logits_label[is_machine]
+
+         return logits_human, logits_mixed, logits_mixed_set, logits_set, logits_label
+
+     def forward(self, encoded_batch, label, indices1, indices2):  # , weights):
+         # print(len(text))
+         q = self.model(encoded_batch)
+         k = q.clone().detach()
+         k = self.fabric.all_gather(k).view(-1, k.size(1))
+         k_label = self.fabric.all_gather(label).view(-1)
+         k_index1 = self.fabric.all_gather(indices1).view(-1)
+         k_index2 = self.fabric.all_gather(indices2).view(-1)
+         # q:N
+         # k:4N
+         logits_human, logits_mixed, logits_mixed_set, logits_set, logits_label = self._compute_logits(q, indices1, indices2, label,
+                                                                                                       k, k_index1, k_index2, k_label)
+         out = self.classifier(q)
+
+         if self.opt.AA:
+             loss_classfiy = F.cross_entropy(out, indices1)
+         else:
+             loss_classfiy = F.cross_entropy(out, label)  # , weight=weights)
+
+         gt_mixed = torch.zeros(logits_mixed.size(0),
+                                dtype=torch.long,
+                                device=logits_mixed.device)
+         gt_mixed_set = torch.zeros(logits_mixed_set.size(0),
+                                    dtype=torch.long,
+                                    device=logits_mixed_set.device)
+         gt_set = torch.zeros(logits_set.size(0),
+                              dtype=torch.long,
+                              device=logits_set.device)
+         gt_label = torch.zeros(logits_label.size(0),
+                                dtype=torch.long,
+                                device=logits_label.device)
+         gt_human = torch.zeros(logits_human.size(0),
+                                dtype=torch.long,
+                                device=logits_human.device)
+
+
+         loss_mixed = F.cross_entropy(logits_mixed,
+                                      gt_mixed)
+         loss_mixed_set = F.cross_entropy(logits_mixed_set,
+                                          gt_mixed_set)
+         loss_set = F.cross_entropy(logits_set,
+                                    gt_set)
+         loss_label = F.cross_entropy(logits_label,
+                                      gt_label)
+         if logits_human.numel() != 0:
+             loss_human = F.cross_entropy(logits_human.to(torch.float64),
+                                          gt_human)
+         else:
+             loss_human = torch.tensor(0, device=self.device)
+
+         loss = self.a * loss_set + (4 * self.b - self.a) * loss_label + self.b * loss_human + self.b * loss_mixed + \
+             2 * self.b * loss_mixed_set + self.c * loss_classfiy
+
+         if self.training:
+             if self.opt.AA:
+                 return loss, loss_mixed, loss_mixed_set, loss_set, loss_label, loss_human, loss_classfiy, k, k_index1
+             else:
+                 return loss, loss_mixed, loss_mixed_set, loss_set, loss_label, loss_classfiy, loss_human, k, k_label
+         else:
+             out = self.fabric.all_gather(out).view(-1, out.size(1))
+             if self.opt.AA:
+                 return loss, out, k, k_index1
+             else:
+                 return loss, out, k, k_label
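Note: the `_compute_logits` methods all follow the same pattern. For each query they average the similarities of its positives, then concatenate that average in front of the masked negatives, so that cross-entropy against target class 0 pulls positives together and pushes negatives apart. A small standalone sketch of that pattern, with toy tensors rather than the training pipeline:

```python
# Toy sketch of the positive-mean / concatenated-negatives construction used above.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
q = F.normalize(torch.randn(4, 8), dim=-1)    # 4 query embeddings
k = F.normalize(torch.randn(12, 8), dim=-1)   # gathered key embeddings
q_label = torch.tensor([0, 1, 0, 1])
k_label = torch.randint(0, 2, (12,))

logits = (q @ k.T) / 0.07                      # cosine similarity / temperature
same = q_label.view(-1, 1) == k_label.view(1, -1)
eps = torch.tensor(1e-6)

pos = (logits * same).sum(dim=1) / torch.max(same.sum(dim=1).float(), eps)
neg = logits * (~same)
out = torch.cat((pos.unsqueeze(1), neg), dim=1)

# Target 0 makes the averaged positive the "correct" class for every query.
loss = F.cross_entropy(out, torch.zeros(4, dtype=torch.long))
print(loss)
```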
src/text_embedding.py ADDED
@@ -0,0 +1,55 @@
+ import torch
+ import torch.nn as nn
+ from transformers import AutoTokenizer, AutoModel
+
+ class TextEmbeddingModel(nn.Module):
+     def __init__(self, model_name, output_hidden_states=False):
+         super(TextEmbeddingModel, self).__init__()
+         self.model_name = model_name
+         if output_hidden_states:
+             self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, output_hidden_states=True)
+         else:
+             self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     def pooling(self, model_output, attention_mask, use_pooling='average', hidden_states=False):
+         if hidden_states:
+             model_output.masked_fill(~attention_mask[None, ..., None].bool(), 0.0)
+             if use_pooling == "average":
+                 emb = model_output.sum(dim=2) / attention_mask.sum(dim=1)[..., None]
+             else:
+                 emb = model_output[:, :, 0]
+                 emb = emb.permute(1, 0, 2)
+         else:
+             model_output.masked_fill(~attention_mask[..., None].bool(), 0.0)
+             if use_pooling == "average":
+                 emb = model_output.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+             elif use_pooling == "cls":
+                 emb = model_output[:, 0]
+         return emb
+
+     def forward(self, encoded_batch, use_pooling='average', hidden_states=False):
+         if "t5" in self.model_name.lower():
+             input_ids = encoded_batch['input_ids']
+             decoder_input_ids = torch.zeros((input_ids.shape[0], 1), dtype=torch.long, device=input_ids.device)
+             model_output = self.model(**encoded_batch,
+                                       decoder_input_ids=decoder_input_ids)
+         else:
+             model_output = self.model(**encoded_batch)
+
+         if 'bge' in self.model_name.lower() or 'mxbai' in self.model_name.lower():
+             use_pooling = 'cls'
+         if isinstance(model_output, tuple):
+             model_output = model_output[0]
+         if isinstance(model_output, dict):
+             if hidden_states:
+                 model_output = model_output["hidden_states"]
+                 model_output = torch.stack(model_output, dim=0)
+             else:
+                 model_output = model_output["last_hidden_state"]
+
+         emb = self.pooling(model_output, encoded_batch['attention_mask'], use_pooling, hidden_states)
+         emb = torch.nn.functional.normalize(emb, dim=-1)
+         return emb
+
+
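Note: a minimal sketch of embedding a couple of sentences with `TextEmbeddingModel`. The model name mirrors the default used in `gen_database.py`; any encoder supported by `AutoModel` should behave the same way here.

```python
# Sketch: mean-pooled, L2-normalized sentence embeddings with TextEmbeddingModel.
import torch
from src.text_embedding import TextEmbeddingModel

model = TextEmbeddingModel("FacebookAI/xlm-roberta-base")
batch = model.tokenizer(
    ["A short example sentence.", "Another one."],
    return_tensors="pt", padding=True, truncation=True, max_length=512,
)
with torch.no_grad():
    emb = model(dict(batch))          # shape (2, hidden_size)
print(emb.shape, emb.norm(dim=-1))    # norms are ~1 after normalization
```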
utils/__init__.py ADDED
File without changes
utils/load_dataset.py ADDED
@@ -0,0 +1,205 @@
+ from torch.utils.data import Dataset
+
+ import os
+ import json
+ import random
+ import hashlib
+
+ def stable_long_hash(input_string):
+     hash_object = hashlib.sha256(input_string.encode())
+     hex_digest = hash_object.hexdigest()
+     int_hash = int(hex_digest, 16)
+     long_long_hash = (int_hash & ((1 << 63) - 1))
+     return long_long_hash
+
+ model_map_authscan = {
+     "gpt-4o-mini-text": 1,
+     "gemini-2.0-text": 2,
+     "deepseek-text": 3,
+     "llama-text": 4
+ }
+
+ model_map_llmdetectaive = {
+     "gemma-text": 1,
+     "mixtral-text": 2,
+     "llama3-text": 3
+ }
+
+ model_map_hart = {
+     "claude-text": 1,
+     "gemini-text": 2,
+     "gpt-text": 3
+ }
+
+ def load_dataset(dataset_name, path=None):
+     dataset = {
+         "train": [],
+         "valid": [],
+         "test": []
+     }
+     if dataset_name == "falconset":
+         model_map = model_map_authscan
+     elif dataset_name == "llmdetectaive":
+         model_map = model_map_llmdetectaive
+     elif dataset_name == "hart":
+         model_map = model_map_hart
+
+     folder = os.listdir(path)
+     # print(folder)
+     for sub in folder:
+         sub_path = os.path.join(path, sub)
+         files = os.listdir(sub_path)
+         for file in files:
+             if not file.endswith('.jsonl'):
+                 continue
+             file_path = os.path.join(sub_path, file)
+             key_name = file.split('.')[0]
+
+             assert key_name in dataset.keys(), f'{key_name} is not in dataset.keys()'
+             with open(file_path, 'r') as f:
+                 data = [json.loads(line) for line in f]
+             for i in range(len(data)):
+                 dct = {}
+                 dct['text'] = data[i]['text']
+                 if sub == "human-text":
+                     dct['label'] = "human"
+                     dct['label_detailed'] = "human"
+                     dct['index'] = (1, 0, 0)
+                 elif sub.startswith("human---"):
+                     dct['label'] = "human+AI"
+                     model = sub.split("---")[1]
+                     dct['label_detailed'] = model
+                     dct['index'] = (1, 1, model_map[model])
+                 else:
+                     dct['label'] = "AI"
+                     dct['label_detailed'] = sub
+                     dct['index'] = (0, 10^3, model_map[sub])
+                 dataset[key_name].append(dct)
+     return dataset
+
+ def load_outdomain_dataset(path):
+     dataset = {
+         "valid": [],
+         "test": []
+     }
+     folder = os.listdir(path)
+     for sub in folder:
+         sub_path = os.path.join(path, sub)
+         files = os.listdir(sub_path)
+         for file in files:
+             if not file.endswith('.jsonl'):
+                 continue
+             file_path = os.path.join(sub_path, file)
+             key_name = file.split('.')[0]
+             assert key_name in dataset.keys(), f'{key_name} is not in dataset.keys()'
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 data = [json.loads(line) for line in f]
+             for i in range(len(data)):
+                 dct = {}
+                 dct['text'] = data[i]['text']
+                 if sub == "human-text":
+                     dct['label'] = "human"
+                     dct['label_detailed'] = "human"
+                     dct['index'] = (1, 0)
+                 elif sub.startswith("human---"):
+                     dct['label'] = "human+AI"
+                     model = sub.split("---")[1]
+                     dct['label_detailed'] = model
+                     dct['index'] = (1, 1)
+                 else:
+                     dct['label'] = "AI"
+                     dct['label_detailed'] = sub
+                     dct['index'] = (0, 10^3)
+                 dataset[key_name].append(dct)
+     return dataset
+
+ def load_dataset_conditional_lang(path=None, language='vi', seed=42):
+     dataset = {
+         "train": [],
+         "val": [],
+         "test": []
+     }
+     combined_data = []
+
+     random.seed(seed)  # for reproducibility
+     folder = os.listdir(path)
+     print("Subfolders:", folder)
+
+     for sub in folder:
+         sub_path = os.path.join(path, sub)
+         if not os.path.isdir(sub_path):
+             continue
+         files = os.listdir(sub_path)
+
+         for file in files:
+             if not file.endswith('.jsonl') or language not in file:
+                 continue
+
+             file_path = os.path.join(sub_path, file)
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 data = [json.loads(line) for line in f]
+
+             for entry in data:
+                 if 'content' not in entry:
+                     print("Key does not exist!")
+                     continue
+
+                 dct = {}
+                 dct['text'] = entry['content']
+
+                 if sub == "human":
+                     dct['label'] = "human"
+                     dct['label_detailed'] = "human"
+                     dct['index'] = (1, 0, 0)
+                 elif sub == "human+AI":
+                     model = entry['label_detailed'].split("+")[1]
+                     dct['label'] = "human+AI"
+                     dct['label_detailed'] = model
+                     dct['index'] = (1, 1, model_map[model])
+                 else:
+                     dct['label'] = "AI"
+                     dct['label_detailed'] = entry['label_detailed']
+                     dct['index'] = (0, 10**3, model_map[entry['label_detailed']])
+
+                 combined_data.append(dct)
+
+     random.shuffle(combined_data)
+     total = len(combined_data)
+     train_end = int(total * 0.9)
+     val_end = train_end + int(total * 0.05)
+
+     dataset['train'] = combined_data[:train_end]
+     dataset['val'] = combined_data[train_end:val_end]
+     dataset['test'] = combined_data[val_end:]
+
+     print(f"Total: {total} | Train: {len(dataset['train'])} | Val: {len(dataset['val'])} | Test: {len(dataset['test'])}")
+     return dataset
+
+
+
+ class TextDataset(Dataset):
+     def __init__(self, dataset, need_ids=True, out_domain=0):
+         self.dataset = dataset
+         self.need_ids = need_ids
+         self.out_domain = out_domain
+
+     def get_class(self):
+         return self.classes
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         text, label, label_detailed, index = self.dataset[idx].values()
+         id = stable_long_hash(text)
+         if self.out_domain:
+             label, is_mixed = index
+             if self.need_ids:
+                 return int(id), text, int(label), int(is_mixed)
+             return text, int(label), int(is_mixed)
+         else:
+             label, is_mixed, write_model = index
+             if self.need_ids:
+                 return int(id), text, int(label), int(is_mixed), int(write_model)
+             return text, int(label), int(is_mixed), int(write_model)
+
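Note: each record carries an `index` tuple of the form `(label, is_mixed, write_model)` (only `(label, is_mixed)` for out-of-domain data), and `TextDataset.__getitem__` unpacks it along with a stable SHA-256-derived id. A small sketch with an invented record, just to show the round trip:

```python
# Sketch: what a TextDataset item looks like for an invented record.
from utils.load_dataset import TextDataset, stable_long_hash

records = [{
    "text": "An invented passage.",
    "label": "AI",
    "label_detailed": "gpt-4o-mini-text",
    "index": (0, 10^3, 1),   # (label, is_mixed, write_model); 10^3 is XOR, i.e. 9
}]
ds = TextDataset(records, need_ids=True)
_id, text, label, is_mixed, write_model = ds[0]
assert _id == stable_long_hash(text)
print(label, is_mixed, write_model)   # -> 0 9 1
```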
utils/utils.py ADDED
@@ -0,0 +1,132 @@
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, hamming_loss
+ import numpy as np
+ from sklearn.preprocessing import MultiLabelBinarizer
+
+ def find_top_n(embeddings, n, index, data):
+     if len(embeddings.shape) == 1:
+         embeddings = embeddings.reshape(1, -1)
+     top_ids_and_scores = index.search_knn(embeddings, n)
+     data_ans = []
+     for i, (ids, scores) in enumerate(top_ids_and_scores):
+         data_now = []
+         for id in ids:
+             data_now.append((data[0][int(id)], data[1][int(id)], data[2][int(id)]))
+         data_ans.append(data_now)
+     return data_ans
+
+
+ def print_line(class_name, metrics, is_header=False):
+     if is_header:
+         line = f"| {'Class':<10} | " + " | ".join([f"{metric:<10}" for metric in metrics])
+     else:
+         line = f"| {class_name:<10} | " + " | ".join([f"{metrics[metric]:<10.3f}" for metric in metrics])
+     print(line)
+     if is_header:
+         print('-' * len(line))
+
+ def calculate_per_class_metrics(classes, ground_truth, predictions):
+     # Convert ground truth and predictions to numeric format
+     gt_numeric = np.array([int(gt) for gt in ground_truth])
+     pred_numeric = np.array([int(pred) for pred in predictions])
+
+     results = {}
+     for i, class_name in enumerate(classes):
+         # For each class, calculate the 'vs rest' binary labels
+         gt_binary = (gt_numeric == i).astype(int)
+         pred_binary = (pred_numeric == i).astype(int)
+
+         # Calculate metrics, handling cases where a class is not present in predictions or ground truth
+         precision = precision_score(gt_binary, pred_binary, zero_division=0)
+         recall = recall_score(gt_binary, pred_binary, zero_division=0)
+         f1 = f1_score(gt_binary, pred_binary, zero_division=0)
+         acc = np.mean(gt_binary == pred_binary)
+         # Calculate recall for all other classes as 'rest'
+         rest_recall = recall_score(1 - gt_binary, 1 - pred_binary, zero_division=0)
+
+         results[class_name] = {
+             'Precision': precision,
+             'Recall': recall,
+             'F1 Score': f1,
+             'Accuracy': acc,
+             'Avg Recall (with rest)': (recall + rest_recall) / 2
+         }
+
+     print_line("Metric", results[classes[0]], is_header=True)
+     for class_name, metrics in results.items():
+         print_line(class_name, metrics)
+     overall_metrics = {metric_name: np.mean([metrics[metric_name] for metrics in results.values()]) for metric_name in results[classes[0]].keys()}
+     print_line("Overall", overall_metrics)
+
+ def calculate_metrics(y_true, y_pred):
+     accuracy = accuracy_score(y_true, y_pred)
+     avg_f1 = f1_score(y_true, y_pred, average='macro')
+     avg_recall = recall_score(y_true, y_pred, average='macro')
+     return accuracy, avg_f1, avg_recall
+
+ def compute_three_recalls(labels, preds):
+     all_n, all_p, tn, tp = 0, 0, 0, 0
+     for label, pred in zip(labels, preds):
+         if label == '0':
+             all_p += 1
+         if label == '1':
+             all_n += 1
+         if pred is not None and label == pred == '0':
+             tp += 1
+         if pred is not None and label == pred == '1':
+             tn += 1
+         if pred is None:
+             continue
+     machine_rec, human_rec = tp * 100 / all_p if all_p != 0 else 0, tn * 100 / all_n if all_n != 0 else 0
+     avg_rec = (human_rec + machine_rec) / 2
+     return (human_rec, machine_rec, avg_rec)
+
+
+ def compute_metrics(labels, preds, ids=None, full_labels=False):
+     if ids is not None:
+         # unique ids
+         dict_labels, dict_preds = {}, {}
+         for i in range(len(ids)):
+             dict_labels[ids[i]] = labels[i]
+             dict_preds[ids[i]] = preds[i]
+         labels = list(dict_labels.values())
+         preds = list(dict_preds.values())
+
+     if not full_labels:
+         labels_map = {(1, 0): 0, (0, 10^3): 1, (1, 1): 2}
+         labels_bin = [labels_map[tup] for tup in labels]
+         preds_bin = [labels_map[tup] for tup in preds]
+
+     else:
+         labels_map = {
+             (1, 0, 0): 0,  # Human
+             (0, 10^3, 1): 1, (0, 10^3, 2): 2, (0, 10^3, 3): 3, (0, 10^3, 4): 4,  # AI
+             (1, 1, 1): 5, (1, 1, 2): 6, (1, 1, 3): 7, (1, 1, 4): 8  # Human+AI
+         }
+         labels_bin = [labels_map[tup] for tup in labels]
+         preds_bin = [labels_map[tup] for tup in preds]
+     acc = accuracy_score(labels_bin, preds_bin)
+     precision = precision_score(labels_bin, preds_bin, average="macro")
+     recall = recall_score(labels_bin, preds_bin, average="macro")
+     f1 = f1_score(labels_bin, preds_bin, average="macro")
+     mse = mean_squared_error(labels_bin, preds_bin)
+     mae = mean_absolute_error(labels_bin, preds_bin)
+
+     return (acc, precision, recall, f1, mse, mae)
+
+ def compute_metrics_train(labels, preds, ids=None):
+     if ids is not None:
+         # unique ids
+         dict_labels, dict_preds = {}, {}
+         for i in range(len(ids)):
+             dict_labels[ids[i]] = labels[i]
+             dict_preds[ids[i]] = preds[i]
+         labels = list(dict_labels.values())
+         preds = list(dict_preds.values())
+
+     human_rec, machine_rec, avg_rec = compute_three_recalls(labels, preds)
+     acc = accuracy_score(labels, preds)
+     precision = precision_score(labels, preds, average="macro")
+     recall = recall_score(labels, preds, average="macro")
+     f1 = f1_score(labels, preds, average="macro")
+
+     return (human_rec, machine_rec, avg_rec, acc, precision, recall, f1)
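Note: `compute_metrics` expects the label tuples described in `utils/load_dataset.py`. A tiny call with invented values, using the default three-class mapping:

```python
# Toy call of compute_metrics with (label, is_mixed) tuples (values invented).
from utils.utils import compute_metrics

labels = [(1, 0), (0, 10^3), (1, 1), (1, 0)]
preds  = [(1, 0), (0, 10^3), (1, 0), (1, 0)]
acc, precision, recall, f1, mse, mae = compute_metrics(labels, preds)
print(acc, f1)
```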