Germano Cavalcante committed on
Commit
91ad34e
·
1 Parent(s): 02673b4

Add utils for generating documentation

Browse files

Currently `utils/generate_blender_doc.py` is not used because
"mano-wii/BAAI_bge-base-en-v1.5-tunned-for-blender-issues" is not
trained to get good scores for terms.

routers/tool_find_related.py CHANGED
@@ -11,14 +11,8 @@ from fastapi import APIRouter
11
 
12
  try:
13
  from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get
14
- from config import settings
15
  except:
16
- import sys
17
  from utils_gitea import gitea_fetch_issues, gitea_json_issue_get
18
- sys.path.append(os.path.abspath(
19
- os.path.join(os.path.dirname(__file__), '..')))
20
- from config import settings
21
-
22
 
23
  def _create_issue_string(title, body):
24
  cleaned_body = body.replace('\r', '')
@@ -76,6 +70,14 @@ class EmbeddingContext:
76
  def __init__(self):
77
  self.lock = threading.Lock()
78
 
 
 
 
 
 
 
 
 
79
  config_type = settings.embedding_api
80
  model_name = settings.embedding_model
81
 
@@ -162,8 +164,9 @@ class EmbeddingContext:
162
  if tokens_len > self.TOKEN_LEN_MAX_BALCKLIST:
163
  # Only use the first TOKEN_LEN_MAX tokens
164
  black_list.add(int(issues[i]['number']))
165
- texts_to_embed[i] = ' '.join(
166
- tokens[:self.TOKEN_LEN_MAX_BALCKLIST])
 
167
 
168
  return texts_to_embed
169
 
 
11
 
12
  try:
13
  from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get
 
14
  except:
 
15
  from utils_gitea import gitea_fetch_issues, gitea_json_issue_get
 
 
 
 
16
 
17
  def _create_issue_string(title, body):
18
  cleaned_body = body.replace('\r', '')
 
70
  def __init__(self):
71
  self.lock = threading.Lock()
72
 
73
+ try:
74
+ from config import settings
75
+ except:
76
+ import sys
77
+ sys.path.append(os.path.abspath(
78
+ os.path.join(os.path.dirname(__file__), '..')))
79
+ from config import settings
80
+
81
  config_type = settings.embedding_api
82
  model_name = settings.embedding_model
83
 
 
164
  if tokens_len > self.TOKEN_LEN_MAX_BALCKLIST:
165
  # Only use the first TOKEN_LEN_MAX tokens
166
  black_list.add(int(issues[i]['number']))
167
+ if self.config_type == 'openai':
168
+ texts_to_embed[i] = ' '.join(
169
+ tokens[:self.TOKEN_LEN_MAX_BALCKLIST])
170
 
171
  return texts_to_embed
172
 
routers/tool_find_related_cache.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bcfc6263dd6858337017723f80b66c7c8f657ad9675b0263ea30f3940291c3b6
3
- size 21219373
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1400db7078bac757625cc72f04d329a218638b3a267c22c2f20a2bba9a52e787
3
+ size 21426824
utils/generate_blender_doc.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import re
4
+ from sentence_transformers import util
5
+
6
+ script_dir = os.path.dirname(os.path.realpath(__file__))
7
+ parent_dir = os.path.dirname(script_dir)
8
+ sys.path.append(parent_dir)
9
+
10
+ # autopep8: off
11
+ from routers.tool_find_related import EMBEDDING_CTX
12
+ # autopep8: on
13
+
14
+ MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
15
+ BASE_URL = "https://docs.blender.org/manual/en/dev"
16
+
17
+
18
def process_text(text):
    """Strip reStructuredText markup noise from ``text``.

    Removes runs of heading/underline characters, ``.. word::`` directive
    markers and ``:word:`` role markers, then collapses every run of
    whitespace that contains a newline into a single newline.
    """
    # Applied in this exact order; each match is replaced by nothing.
    removal_patterns = (
        # Repeated characters.
        r'%{2,}',
        r'#{2,}',
        r'={3,}',
        r'\*{3,}',
        r'\^{3,}',
        r'-{3,}',
        # Patterns ".. word:: " and ":word:".
        r'\.\. \S+',
        r':\w+:',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    return re.sub(r'(\s*\n\s*)+', '\n', text)
33
+
34
+
35
def parse_file(filedir, filename):
    """Parse one manual ``.rst`` file and, for index files, its toctree.

    Args:
        filedir: Directory that contains ``filename``.
        filename: File to read; may include sub-directories relative to
            ``filedir``.

    Returns:
        A dict with:
          * ``'processed_text'``: the page body after ``process_text``.
          * ``'n_tokens'``: number of tokens the embedding model's tokenizer
            produces for ``'processed_text'``.
          * ``'toctree'`` (only for files ending in ``index.rst`` that contain
            a ``.. toctree::`` directive): maps each entry name (the entry
            without its ``/index.rst`` or ``.rst`` suffix) to the dict
            returned by parsing that entry recursively.
    """
    with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
        content = file.read()

    parsed_data = {}

    if not filename.endswith('index.rst'):
        body = content.strip()
    else:
        # Everything before the first toctree directive is the page body.
        parts = content.split(".. toctree::")
        body = parts[0].strip()

        if len(parts) > 1:
            parsed_data["toctree"] = {}
            for part in parts[1:]:
                # The first line is the remainder of the directive line
                # itself, so the entries start on the following line.
                for entry in part.split('\n')[1:]:
                    entry = entry.strip()
                    if not entry:
                        continue

                    if entry.startswith('/'):
                        # Entries with a leading slash are not resolved
                        # against `filedir`; they are skipped.
                        continue

                    if not entry.endswith('.rst'):
                        # Directive options and any other non-file lines.
                        continue

                    if entry.endswith('/index.rst'):
                        # Sub-section: recurse into its own directory.
                        entry_name = entry[:-len('/index.rst')]
                        filedir_ = os.path.join(filedir, entry_name)
                        filename_ = 'index.rst'
                    else:
                        entry_name = entry[:-len('.rst')]
                        filedir_ = filedir
                        filename_ = entry

                    parsed_data['toctree'][entry_name] = parse_file(
                        filedir_, filename_)

    processed_text = process_text(body)
    tokens = EMBEDDING_CTX.model.tokenizer.tokenize(processed_text)
    parsed_data['processed_text'] = processed_text
    parsed_data['n_tokens'] = len(tokens)

    return parsed_data
85
+
86
+
87
# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens, tokenize=None):
    """Split ``text`` into chunks that each stay within ``max_tokens``.

    The text is cut at ``'.\\n'`` boundaries and consecutive pieces are
    greedily packed into a chunk until the next piece would overflow it.
    A single piece that is longer than ``max_tokens`` on its own is dropped.

    Args:
        text: Text to split.
        max_tokens: Maximum token budget per chunk.
        tokenize: Optional callable mapping a string to a sequence of tokens.
            Defaults to the tokenizer of ``EMBEDDING_CTX.model``.

    Returns:
        A list of ``(chunk_text, n_tokens)`` tuples. ``n_tokens`` counts one
        extra token per piece, mirroring the accounting used while packing.
    """
    if tokenize is None:
        tokenize = EMBEDDING_CTX.model.tokenizer.tokenize

    # Split the text into sentences
    paragraphs = text.split('.\n')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenize(" " + sentence)) for sentence in paragraphs]

    chunks = []
    tokens_so_far = 0
    chunk = []

    for sentence, token in zip(paragraphs, n_tokens):
        # Flush the current chunk when this sentence would overflow it.
        # Only flush a non-empty chunk: an oversized sentence arriving while
        # nothing is pending must not produce an empty "." chunk.
        if chunk and tokens_so_far + token > max_tokens:
            chunks.append((".\n".join(chunk) + ".", tokens_so_far))
            chunk = []
            tokens_so_far = 0

        # A sentence that can never fit is skipped entirely.
        if token > max_tokens:
            continue

        chunk.append(sentence)
        tokens_so_far += token + 1

    if chunk:
        chunks.append((".\n".join(chunk) + ".", tokens_so_far))

    return chunks
125
+
126
+
127
def get_texts(data, path):
    """Collect the embeddable texts for the node at ``path`` and its subtree.

    Args:
        data: Root dict as produced by ``parse_file``.
        path: List of ``'toctree'`` keys leading from ``data`` to the node
            to process (``[]`` for the root itself).

    Returns:
        A list of ``[path, text]`` pairs. The node's own text is split with
        ``split_into_many`` when it exceeds the model's ``max_seq_length``.
        Each resulting text is then prefixed with the texts of its ancestors,
        nearest first, for as long as the combined token count stays below
        that limit. Pairs for all descendants follow, depth first.
    """
    result = []
    processed_texts = [data['processed_text']]
    processed_tokens = [data['n_tokens']]
    max_tokens = EMBEDDING_CTX.model.max_seq_length

    # Walk down to the target node, remembering every ancestor on the way.
    data_ = data
    for key in path:
        data_ = data_['toctree'][key]
        processed_texts.append(data_['processed_text'])
        processed_tokens.append(data_['n_tokens'])

    if processed_tokens[-1] > max_tokens:
        chunks = split_into_many(processed_texts[-1], max_tokens)
    else:
        chunks = [(processed_texts[-1], processed_tokens[-1])]

    for text, n_tokens in chunks:
        # Add context to the text if we have space
        for i in range(len(processed_texts) - 2, -1, -1):
            n_tokens_parent = processed_tokens[i]
            if n_tokens + n_tokens_parent >= max_tokens:
                break

            text = processed_texts[i] + '\n' + text
            n_tokens += n_tokens_parent

        result.append([path, text])

    # A missing 'toctree' simply means the node has no children. Looking it
    # up with a default keeps a KeyError raised deeper in the recursion from
    # being swallowed and silently truncating the result.
    for key in data_.get('toctree', {}):
        result.extend(get_texts(data, path + [key]))

    return result
164
+
165
+
166
def _sort_similarity(chunks, embeddings, text_to_search, limit):
    """Return the paths of the chunks that best match ``text_to_search``.

    The query is encoded with ``EMBEDDING_CTX`` and compared against
    ``embeddings`` using dot-product scoring; at most ``limit`` hits are
    requested. For every hit, the first element of the corresponding entry
    of ``chunks`` (its path) is returned, in the order the search yields them.
    """
    query_emb = EMBEDDING_CTX.encode([text_to_search])
    hits = util.semantic_search(
        query_emb, embeddings, top_k=limit, score_function=util.dot_score)

    # Only one query was submitted, so only the first hit list is relevant.
    return [chunks[hit['corpus_id']][0] for hit in hits[0]]
180
+
181
+
182
if __name__ == '__main__':
    # Parse the manual starting at its root index, then attach the copyright
    # page explicitly as an extra top-level entry.
    data = parse_file(MANUAL_DIR, 'index.rst')
    data['toctree']["copyright"] = parse_file(MANUAL_DIR, 'copyright.rst')

    # Flatten the parsed tree into [path, text] pairs.
    chunks = list(get_texts(data, []))

    # Embed every text and run a sample query against the result.
    texts = [text for path, text in chunks]
    embeddings = EMBEDDING_CTX.encode(texts)

    result = _sort_similarity(chunks, embeddings, "Set Snap Base", 50)
    print(result)
utils/generate_bpy_doc.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bpy
2
+ import inspect
3
+ import time
4
+ import pickle
5
+ import mathutils
6
+ import os
7
+ import bpy_types
8
+ import addon_utils
9
+ import sys
10
+
11
+ INFO_MEMBER = "__info"
12
+
13
+
14
def get_info(name="", descr="", bases=None):
    """Bundle the documentation fields of one entry into a dict.

    Returns a dict with the keys ``"name"``, ``"descr"`` and ``"bases"``
    (in that order) holding the given arguments unchanged.
    """
    return dict(name=name, descr=descr, bases=bases)
18
+
19
+ ##################################################################
20
+
21
+
22
+ g_bpy_types = {}
23
+
24
+
25
def doc_from_bpy_struct(bl_rna):
    """Build the info dict for an RNA struct or property.

    Follows the ``base`` chain of ``bl_rna`` and records the type name of
    every base encountered.

    Args:
        bl_rna: Object exposing ``name`` and ``description`` and, optionally,
            a ``base`` attribute.

    Returns:
        The dict from ``get_info``. ``bases`` is the list of collected base
        type names; it is ``None`` when an attribute was missing before any
        base could be collected, and an empty list when ``base`` exists but
        is falsy.
    """
    bases = []
    try:
        base = bl_rna.base
        while base:
            bases.append(type(base).__name__)
            base = base.base
    except AttributeError:
        # Only a missing attribute is expected here (presumably RNA
        # properties have no `base` - verify against the bpy API). Anything
        # else, including KeyboardInterrupt, must propagate.
        if not bases:
            bases = None

    return get_info(name=bl_rna.name, descr=bl_rna.description, bases=bases)
37
+
38
+
39
def bpy_type_first_step(bpy_type):
    """Build the first-pass documentation map for one ``bpy.types`` class.

    Args:
        bpy_type: Class exposing ``bl_rna``.

    Returns:
        A dict holding the type's own info under ``INFO_MEMBER`` plus one
        entry per property that is not skipped by
        ``is_member_from_base_class``:
          * POINTER properties map to the referenced ``bpy.types`` class.
          * COLLECTION properties map to the collection's own class when
            ``prop.srna`` is set, otherwise to a one-element list holding
            the element class.
          * Every other property maps to ``{INFO_MEMBER: info}``.
        Classes are stored as placeholders; ``bpy_types_second_step``
        replaces them with their documentation maps. Properties whose type
        cannot be found in ``bpy.types`` are omitted.
    """
    def is_member_from_base_class(bpy_type, identifier):
        # Properties also present on `bpy.types.ID` or on any class further
        # up the MRO are skipped.
        if identifier in bpy.types.ID.bl_rna.properties:
            return True

        bases = bpy_type.mro()[1:]
        for base in bases:
            if not hasattr(base, "bl_rna"):
                continue
            if identifier in base.bl_rna.properties:
                return True
        return False

    def lookup_type(srna_type):
        # One guarded lookup shared by the POINTER and COLLECTION branches,
        # so a missing type is skipped instead of aborting the whole run.
        return getattr(bpy.types, srna_type, None)

    info = doc_from_bpy_struct(bpy_type.bl_rna)
    data = {INFO_MEMBER: info}
    for prop in bpy_type.bl_rna.properties:
        identifier = prop.identifier
        if is_member_from_base_class(bpy_type, identifier):
            continue

        if prop.type == 'POINTER':
            pointer_type = lookup_type(prop.fixed_type.identifier)
            if pointer_type is not None:
                data[identifier] = pointer_type
            continue

        if prop.type == 'COLLECTION':
            if prop.srna:
                collection_type = lookup_type(prop.srna.identifier)
                if collection_type is not None:
                    data[identifier] = collection_type
            elif srna_type := prop.fixed_type.identifier:
                element_type = lookup_type(srna_type)
                if element_type is not None:
                    # The list marks "collection of this element type".
                    data[identifier] = [element_type]
            continue

        info_member = doc_from_bpy_struct(prop)
        data[identifier] = {INFO_MEMBER: info_member}

    return data
80
+
81
+
82
def bpy_types_first_step():
    """Fill ``g_bpy_types`` with a first-pass map for every RNA type.

    Every attribute of ``bpy.types`` that exposes ``bl_rna`` is used as a
    key; its value is the dict returned by ``bpy_type_first_step``.
    """
    for type_name in dir(bpy.types):
        candidate = getattr(bpy.types, type_name)
        if hasattr(candidate, "bl_rna"):
            g_bpy_types[candidate] = bpy_type_first_step(candidate)
89
+
90
+
91
def bpy_types_second_step():
    """Resolve the type placeholders stored in ``g_bpy_types``.

    Entries whose value exposes ``bl_rna`` are replaced by the map registered
    for that type in ``g_bpy_types``; for list values, the first element is
    replaced the same way. All other values are left untouched.
    """
    for members in g_bpy_types.values():
        # Only existing keys are reassigned, so the dict does not change
        # size while it is being iterated.
        for member_name, member in members.items():
            if hasattr(member, "bl_rna"):
                members[member_name] = g_bpy_types[member]
            elif isinstance(member, list):
                member[0] = g_bpy_types[member[0]]
99
+
100
+
101
+ ##################################################################
102
+
103
+ bases_builtin = {int, bool, float, str, bytes, tuple, list,
104
+ set, dict, mathutils.Vector, mathutils.Color, type(None)}
105
+
106
+
107
def is_member_inherited(obj, member):
    """Tell whether ``member`` comes from a builtin base type of ``obj``.

    Returns True when any class in the MRO of ``type(obj)`` that is also
    listed in ``bases_builtin`` has an attribute named ``member``.
    """
    builtin_ancestors = set(inspect.getmro(type(obj))) & bases_builtin
    return any(hasattr(ancestor, member) for ancestor in builtin_ancestors)
115
+
116
+
117
def get_doc_recursive(parent, member):
    """Build the documentation map for ``parent.<member>``, recursively.

    Returns:
        * For values whose type is in ``bases_builtin`` (or when ``member``
          is ``"bpy_func"``): an info-only dict describing the type name.
        * For RNA instances and RNA classes: the map already stored in
          ``g_bpy_types``.
        * Otherwise: a dict holding the object's own info plus one entry per
          public attribute, each built by recursing into it.
    """
    ob = getattr(parent, member)
    # Prefer the class-level attribute (e.g. a descriptor) when available.
    member_info = getattr(type(parent), member, ob)

    if member == "bpy_func" or type(member_info) in bases_builtin:
        return {INFO_MEMBER: get_info(descr=type(member_info).__name__)}

    if hasattr(type(ob), "bl_rna"):
        return g_bpy_types[type(ob)]

    if "bl_rna" in dir(ob):
        return g_bpy_types[ob]

    own_doc = member_info.__doc__
    result = {
        INFO_MEMBER: get_info(descr=own_doc if own_doc else type(ob).__name__),
    }

    # Modules that must not be descended into.
    skipped_modules = (os, bpy, bpy_types, addon_utils, sys)

    for name in dir(ob):
        if name.startswith("_") or is_member_inherited(ob, name):
            continue

        ob_member = getattr(ob, name, None)
        if ob_member == parent:
            # Back-reference to the parent: record it without recursing.
            result[name] = {
                INFO_MEMBER: get_info(descr=type(parent).__name__)}
            continue
        if any(ob_member == module for module in skipped_modules):
            continue
        if name == "addon_install":
            # This raises a Error
            continue

        result[name] = get_doc_recursive(ob, name)

    return result
161
+
162
+
163
+ ##################################################################
164
+
165
def print_doc_recursive(map, indent, name, max_step=3):
    """Print a documentation map as an indented tree.

    Args:
        map: Documentation dict; the ``INFO_MEMBER`` entry holds the info of
            the node itself, every other entry is a child map or a
            one-element list wrapping a child map.
        indent: Current depth; also the number of ``'|'`` characters used as
            the line prefix.
        name: Dotted name printed for this node.
        max_step: Children are only printed while ``indent < max_step``.
    """
    # NOTE(review): every call pauses for half a second; presumably this
    # throttles console output - confirm it is still wanted.
    time.sleep(.5)
    prefix = indent * '|'
    print(prefix + name)
    for key, val in map.items():
        if key == INFO_MEMBER:
            # The info entry is the dict built by `get_info`, not a string,
            # so print its description. Plain strings are still accepted.
            descr = val if isinstance(val, str) else (val.get("descr") or "")
            print(prefix + descr.replace('\n', '\n' + prefix) + '\n' + prefix)
        elif indent < max_step:
            name_next = name + '.' + key
            if isinstance(val, list):
                print_doc_recursive(val[0], indent + 1,
                                    name_next + "[0]", max_step=max_step)
            else:
                print_doc_recursive(
                    val, indent + 1, name_next, max_step=max_step)
180
+
181
+
182
def main():
    """Collect the documentation of the ``bpy`` module and pickle it.

    Builds the per-type maps in ``g_bpy_types``, walks the listed ``bpy``
    sub-modules with ``get_doc_recursive`` and writes the resulting dict to
    ``bpy_doc_dir`` using the highest pickle protocol.
    """
    print("-------------------------------------------------------------")
    bpy_types_first_step()
    bpy_types_second_step()

    members = (
        "app",
        "context",
        "data",
        "msgbus",
        "ops",
        "path",
        "props",
        "types",
        "utils",
    )

    result = {
        "bpy": {INFO_MEMBER: get_info(descr=bpy.__doc__)},
        INFO_MEMBER: {"bases": None},
    }
    for member in members:
        result["bpy"][member] = get_doc_recursive(bpy, member)

    # Reference some types at the beginning
    result["bpy_struct"] = result["bpy"]["types"]["bpy_struct"]
    result["bpy_types"] = result["bpy"]["types"]

    # NOTE(review): the file name has no separator between "routers" and
    # "bpy_doc_v41.pkl" - confirm whether "routers/bpy_doc_v41.pkl" was
    # intended.
    bpy_doc_dir = "D:/Dev/function-calling/routersbpy_doc_v41.pkl"
    with open(bpy_doc_dir, "wb") as file:
        pickle.dump(result, file, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"File '{bpy_doc_dir}' has been updated.")


if __name__ == '__main__':
    main()