# find_related.py
import os
import pickle
import re
import threading

import torch
from enum import Enum
from fastapi import APIRouter
from sentence_transformers import SentenceTransformer, util

try:
    from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get
except ImportError:
    from utils_gitea import gitea_fetch_issues, gitea_json_issue_get


def _create_issue_string(title, body):
    # Strip the bug-report template boilerplate so that only the
    # user-written text is embedded.
    cleaned_body = body.replace('\r', '')
    cleaned_body = cleaned_body.replace('**System Information**\n', '')
    cleaned_body = cleaned_body.replace('**Blender Version**\n', '')
    cleaned_body = cleaned_body.replace(
        'Worked: (newest version of Blender that worked as expected)\n', '')
    cleaned_body = cleaned_body.replace('**Short description of error**\n', '')
    cleaned_body = cleaned_body.replace('**Addon Information**\n', '')
    cleaned_body = cleaned_body.replace(
        '**Exact steps for others to reproduce the error**\n', '')
    cleaned_body = cleaned_body.replace(
        '[Please describe the exact steps needed to reproduce the issue]\n', '')
    cleaned_body = cleaned_body.replace(
        '[Please fill out a short description of the error here]\n', '')
    cleaned_body = cleaned_body.replace(
        '[Based on the default startup or an attached .blend file (as simple as possible)]\n', '')
    cleaned_body = re.sub(
        r', branch: .+?, commit date: \d{4}-\d{2}-\d{2} \d{2}:\d{2}, hash: `.+?`',
        '', cleaned_body)
    cleaned_body = re.sub(
        r'\/?attachments\/[a-zA-Z0-9\-]+', 'attachment', cleaned_body)
    cleaned_body = re.sub(
        r'https?:\/\/[^\s/]+(?:\/[^\s/]+)*\/([^\s/]+)',
        lambda match: match.group(1), cleaned_body)

    return title + '\n' + cleaned_body


def _find_latest_date(issues, default_str=None):
    # Handle the case where 'issues' is empty.
    if not issues:
        return default_str

    return max((issue['updated_at'] for issue in issues), default=default_str)


class EmbeddingContext:
    # These don't change
    TOKEN_LEN_MAX_FOR_EMBEDDING = 512
    TOKEN_LEN_MAX_BLACKLIST = 2 * TOKEN_LEN_MAX_FOR_EMBEDDING
    ARRAY_CHUNK_SIZE = 4096
    issue_attr_filter = {'number', 'title', 'body', 'state', 'updated_at'}
    cache_path = "routers/tool_find_related_cache.pkl"

    # Set when creating the object
    lock = None
    model = None
    openai_client = None
    model_name = ''
    config_type = ''
    embedding_shape = None
    embedding_dtype = None
    embedding_device = None

    # Updates constantly
    data = {}
    black_list = {'blender': {109399, 113157, 114706},
                  'blender-addons': set()}

    def __init__(self):
        self.lock = threading.Lock()

        try:
            from config import settings
        except ImportError:
            import sys
            sys.path.append(os.path.abspath(
                os.path.join(os.path.dirname(__file__), '..')))
            from config import settings

        config_type = settings.embedding_api
        model_name = settings.embedding_model

        if config_type == 'sbert':
            self.model = SentenceTransformer(model_name, use_auth_token=False)
            self.model.max_seq_length = self.TOKEN_LEN_MAX_FOR_EMBEDDING
            print("Max Sequence Length:", self.model.max_seq_length)

            self.encode = self.encode_sbert
            if torch.cuda.is_available():
                self.model = self.model.to('cuda')

        elif config_type == 'openai':
            from openai import OpenAI
            self.openai_client = OpenAI(
                # base_url = settings.openai_api_base
                api_key=settings.OPENAI_API_KEY,
            )
            self.encode = self.encode_openai

        self.model_name = model_name
        self.config_type = config_type

        # Probe once to learn the shape, dtype and device of the embeddings.
        tmp = self.encode(['tmp'])
        self.embedding_shape = tmp.shape[1:]
        self.embedding_dtype = tmp.dtype
        self.embedding_device = tmp.device

    def encode(self, texts_to_embed):
        # Placeholder; rebound to `encode_sbert` or `encode_openai` in `__init__`.
        pass

    def encode_sbert(self, texts_to_embed):
        return self.model.encode(texts_to_embed,
                                 show_progress_bar=True,
                                 convert_to_tensor=True,
                                 normalize_embeddings=True)

    def encode_openai(self, texts_to_embed):
        import math
        import time

        tokens_count = 0
        for text in texts_to_embed:
            tokens_count += len(self.get_tokens(text))

        # Split the input into chunks to stay within the API's token budget,
        # pausing between calls to respect the rate limit.
        chunks_num = math.ceil(tokens_count / 500000)
        chunk_size = math.ceil(len(texts_to_embed) / chunks_num)

        embeddings = []
        for i in range(chunks_num):
            start = i * chunk_size
            end = start + chunk_size
            chunk = texts_to_embed[start:end]

            embeddings_tmp = self.openai_client.embeddings.create(
                model=self.model_name,
                input=chunk,
            ).data

            if embeddings_tmp is None:
                break

            embeddings.extend(embeddings_tmp)

            if i < chunks_num - 1:
                time.sleep(60)  # Wait 1 minute before the next call

        return torch.stack([torch.tensor(embedding.embedding, dtype=torch.float32)
                            for embedding in embeddings])

    def get_tokens(self, text):
        if self.model:
            return self.model.tokenizer.tokenize(text)

        # Fallback: a crude word/punctuation split used when no SBERT
        # tokenizer is available (the OpenAI configuration).
        tokens = []
        for token in re.split(r'(\W|\b)', text):
            if token.strip():
                tokens.append(token)

        return tokens

    def create_strings_to_embed(self, issues, black_list):
        texts_to_embed = [_create_issue_string(
            issue['title'], issue['body']) for issue in issues]

        # Blacklist overlong issues (for keeping track of what was skipped).
        token_count = 0
        for i, text in enumerate(texts_to_embed):
            tokens = self.get_tokens(text)
            tokens_len = len(tokens)
            token_count += tokens_len

            if tokens_len > self.TOKEN_LEN_MAX_BLACKLIST:
                black_list.add(int(issues[i]['number']))
                if self.config_type == 'openai':
                    # Only use the first TOKEN_LEN_MAX_BLACKLIST tokens.
                    texts_to_embed[i] = ' '.join(
                        tokens[:self.TOKEN_LEN_MAX_BLACKLIST])

        return texts_to_embed

    def data_ensure_size(self, repo, size_new):
        updated_at_old = None
        arrays_size_old = 0
        titles_old = []

        try:
            arrays_size_old = self.data[repo]['arrays_size']
            if size_new <= arrays_size_old:
                return
            # Preserve the existing metadata when growing the arrays.
            updated_at_old = self.data[repo]['updated_at']
            titles_old = self.data[repo]['titles']
        except KeyError:
            pass

        arrays_size_new = self.ARRAY_CHUNK_SIZE * \
            (int(size_new / self.ARRAY_CHUNK_SIZE) + 1)

        data_new = {
            'updated_at': updated_at_old,
            'arrays_size': arrays_size_new,
            'titles': titles_old + [None] * (arrays_size_new - arrays_size_old),
            'embeddings': torch.empty((arrays_size_new, *self.embedding_shape),
                                      dtype=self.embedding_dtype,
                                      device=self.embedding_device),
            'opened': torch.zeros(arrays_size_new, dtype=torch.bool),
            'closed': torch.zeros(arrays_size_new, dtype=torch.bool),
        }

        try:
            data_new['embeddings'][:arrays_size_old] = self.data[repo]['embeddings']
            data_new['opened'][:arrays_size_old] = self.data[repo]['opened']
            data_new['closed'][:arrays_size_old] = self.data[repo]['closed']
        except KeyError:
            pass

        self.data[repo] = data_new

    def embeddings_generate(self, repo):
        if os.path.exists(self.cache_path):
            with open(self.cache_path, 'rb') as file:
                self.data = pickle.load(file)
                if repo in self.data:
                    return

        if repo not in self.black_list:
            # Use a set: `create_strings_to_embed` adds issue numbers to it.
            self.black_list[repo] = set()

        black_list = self.black_list[repo]

        issues = gitea_fetch_issues('blender', repo, state='all', since=None,
                                    issue_attr_filter=self.issue_attr_filter,
                                    exclude=black_list)

        # issues = sorted(issues, key=lambda issue: int(issue['number']))

        print("Embedding Issues...")
        texts_to_embed = self.create_strings_to_embed(issues, black_list)
        embeddings = self.encode(texts_to_embed)

        self.data_ensure_size(repo, int(issues[0]['number']))
        self.data[repo]['updated_at'] = _find_latest_date(issues)

        titles = self.data[repo]['titles']
        embeddings_new = self.data[repo]['embeddings']
        opened = self.data[repo]['opened']
        closed = self.data[repo]['closed']

        for i, issue in enumerate(issues):
            number = int(issue['number'])
            titles[number] = issue['title']
            embeddings_new[number] = embeddings[i]
            if issue['state'] == 'open':
                opened[number] = True
            if issue['state'] == 'closed':
                closed[number] = True
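
    # Layout of each per-repo cache entry created by `data_ensure_size`;
    # all arrays are indexed by issue number:
    #   'updated_at':  most recent 'updated_at' timestamp seen for the repo
    #   'arrays_size': allocated capacity (a multiple of ARRAY_CHUNK_SIZE)
    #   'titles':      issue titles (None where nothing is cached yet)
    #   'embeddings':  tensor of shape (arrays_size, *embedding_shape)
    #   'opened', 'closed': boolean masks marking each issue's state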

    def embeddings_updated_get(self, repo):
        with self.lock:
            try:
                data = self.data[repo]
            except KeyError:
                self.embeddings_generate(repo)
                data = self.data[repo]

            black_list = self.black_list[repo]
            date_old = data['updated_at']

            issues = gitea_fetch_issues(
                'blender', repo, since=date_old,
                issue_attr_filter=self.issue_attr_filter, exclude=black_list)

            # Get the most recent date
            date_new = _find_latest_date(issues, date_old)

            if date_new == date_old:
                # Nothing changed
                return data

            data['updated_at'] = date_new

            # autopep8: off
            # WORKAROUND:
            # Consider that if the time hasn't changed, it's the same issue.
            issues = [issue for issue in issues if issue['updated_at'] != date_old]

            self.data_ensure_size(repo, int(issues[0]['number']))
            # `data_ensure_size` may have replaced the dict, so re-fetch it.
            data = self.data[repo]

            texts_to_embed = self.create_strings_to_embed(issues, black_list)
            embeddings = self.encode(texts_to_embed)

            for i, issue in enumerate(issues):
                number = int(issue['number'])
                data['titles'][number] = issue['title']
                data['embeddings'][number] = embeddings[i]
                if issue['state'] == 'open':
                    data['opened'][number] = True
                if issue['state'] == 'closed':
                    data['closed'][number] = True
            # autopep8: on

        return data


router = APIRouter()

EMBEDDING_CTX = EmbeddingContext()
# EMBEDDING_CTX.embeddings_generate('blender')
# EMBEDDING_CTX.embeddings_generate('blender-addons')


# Issue-state filter accepted by the endpoint.
class State(str, Enum):
    opened = "opened"
    closed = "closed"
    all = "all"


def _sort_similarity(data: dict,
                     query_emb: torch.Tensor,
                     limit: int,
                     state: State = State.opened) -> list:
    duplicates = []

    embeddings = data['embeddings']
    true_indices = None
    if state != State.all:
        # Restrict the search to opened or closed issues and remember the
        # original issue numbers of the masked rows.
        mask = data[state.value]
        embeddings = embeddings[mask]
        true_indices = mask.nonzero(as_tuple=True)[0]

    # The embeddings are normalized at encode time, so the dot product is
    # the cosine similarity.
    ret = util.semantic_search(
        query_emb, embeddings, top_k=limit, score_function=util.dot_score)

    for score in ret[0]:
        corpus_id = score['corpus_id']
        number = true_indices[corpus_id].item() \
            if true_indices is not None else corpus_id
        text = f"#{number}: {data['titles'][number]}"
        duplicates.append(text)

    return duplicates


def find_relatedness(repo: str, number: int, limit: int = 20,
                     state: State = State.opened):
    data = EMBEDDING_CTX.embeddings_updated_get(repo)

    # Check if the embedding already exists.
    if data['titles'][number] is not None:
        new_embedding = data['embeddings'][number]
    else:
        gitea_issue = gitea_json_issue_get('blender', repo, number)
        text_to_embed = _create_issue_string(
            gitea_issue['title'], gitea_issue['body'])
        new_embedding = EMBEDDING_CTX.encode([text_to_embed])

    duplicates = _sort_similarity(
        data, new_embedding, limit=limit, state=state)

    if not duplicates:
        return ''

    # If the query issue itself is the top match, drop it from the results.
    number_cached = int(re.search(r'#(\d+):', duplicates[0]).group(1))
    if number_cached == number:
        return '\n'.join(duplicates[1:])

    return '\n'.join(duplicates)


@router.get("/find_related/{repo}/{number}")
def find_related(repo: str = 'blender',
                 number: int = 104399,
                 limit: int = 15,
                 state: State = State.opened):
    related = find_relatedness(repo, number, limit=limit, state=state)
    return related
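
# Example request against this endpoint (a sketch; it assumes the router is
# mounted at the application root):
#
#   GET /find_related/blender/111434?limit=15&state=opened
#
# The response is a newline-separated list of "#<number>: <title>" entries.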

if __name__ == "__main__":
    update_cache = True
    if update_cache:
        EMBEDDING_CTX.embeddings_updated_get('blender')
        EMBEDDING_CTX.embeddings_updated_get('blender-addons')
        cache_path = EMBEDDING_CTX.cache_path
        with open(cache_path, "wb") as file:
            # Move the embeddings to the CPU before pickling, as the virtual
            # machine currently in use only supports the CPU.
            for val in EMBEDDING_CTX.data.values():
                val['embeddings'] = val['embeddings'].to(torch.device('cpu'))
            pickle.dump(EMBEDDING_CTX.data, file,
                        protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # Move the embeddings back to the GPU.
        for val in EMBEDDING_CTX.data.values():
            val['embeddings'] = val['embeddings'].to(torch.device('cuda'))

    # 'blender/blender/111434' must print #96153, #83604 and #79762.
    related1 = find_relatedness('blender', 111434, limit=20)
    related2 = find_relatedness('blender-addons', 104399, limit=20)

    print("These are the 20 most related issues:")
    print(related1)
    print()
    print("These are the 20 most related issues:")
    print(related2)
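
# Minimal sketch for serving this router standalone (assumes `uvicorn` is
# available; the actual application wiring lives elsewhere in the project):
#
#   from fastapi import FastAPI
#   app = FastAPI()
#   app.include_router(router)
#
# Then run, e.g.: uvicorn find_related:app --host 0.0.0.0 --port 8000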