Update README.md
Browse files
README.md
CHANGED
@@ -158,6 +158,49 @@ topic_model.get_topic_info()
|
|
158 |
|
159 |
</details>
|
160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
## Training hyperparameters
|
162 |
|
163 |
* calculate_probabilities: False
|
|
|
158 |
|
159 |
</details>
|
160 |
|
161 |
+
|
162 |
+
## Training Procedure
|
163 |
+
|
164 |
+
The model was trained as follows:
|
165 |
+
|
166 |
+
```python
|
167 |
+
from bertopic import BERTopic
|
168 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
169 |
+
from bertopic.representation import KeyBERTInspired
|
170 |
+
|
171 |
+
from sentence_transformers import SentenceTransformer
|
172 |
+
from umap import UMAP
|
173 |
+
from hdbscan import HDBSCAN
|
174 |
+
|
175 |
+
# Prepare sub-models
|
176 |
+
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
|
177 |
+
umap_model = UMAP(n_components=5, n_neighbors=50, random_state=42, metric="cosine", verbose=True)
|
178 |
+
hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=False, min_cluster_size=20)
|
179 |
+
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)
|
180 |
+
|
181 |
+
# Representation models
|
182 |
+
representation_models = {"KeyBERTInspired": KeyBERTInspired()}
|
183 |
+
|
184 |
+
# Fit BERTopic
|
185 |
+
topic_model = BERTopic(
|
186 |
+
umap_model=umap_model,
|
187 |
+
hdbscan_model=hdbscan_model,
|
188 |
+
vectorizer_model=vectorizer_model,
|
189 |
+
representation_model=representation_models,
|
190 |
+
min_topic_size= 10,
|
191 |
+
n_gram_range= (1, 1),
|
192 |
+
nr_topics=None,
|
193 |
+
seed_topic_list=None,
|
194 |
+
top_n_words=10,
|
195 |
+
calculate_probabilities=False,
|
196 |
+
language=None,
|
197 |
+
verbose = True
|
198 |
+
).fit(docs)
|
199 |
+
|
200 |
+
|
201 |
+
```
|
202 |
+
|
203 |
+
|
204 |
## Training hyperparameters
|
205 |
|
206 |
* calculate_probabilities: False
|