nicholasKluge committed on
Commit
dc1fcc4
·
verified ·
1 Parent(s): 43ce92f

Delete create-tfidf-matrix.py

Browse files
Files changed (1) hide show
  1. create-tfidf-matrix.py +0 -46
create-tfidf-matrix.py DELETED
@@ -1,46 +0,0 @@
1
- import joblib
2
- import pandas as pd
3
- from sklearn.metrics.pairwise import cosine_similarity
4
- from sklearn.feature_extraction.text import TfidfVectorizer
5
- import argparse
6
-
7
- def main():
8
- parser = argparse.ArgumentParser(description='Process some integers.')
9
- parser.add_argument('--input', type=str, help="Input file path (file should be in parquet format and have 'prompt' and 'completion' columns)")
10
- parser.add_argument('--output', type=str, help='Output file path')
11
- args = parser.parse_args()
12
-
13
- df = pd.read_parquet(args.input)
14
-
15
- # fit the vectorizer on the prompt column
16
- prompt_tfidf_vectorizer = TfidfVectorizer()
17
- prompt_tfidf_vectorizer.fit(df['prompt'])
18
-
19
- # save the vectorizer
20
- joblib.dump(prompt_tfidf_vectorizer, args.output + 'prompt-vectorizer.pkl')
21
-
22
- # get the tfidf_matrix
23
- prompt_tfidf_matrix = prompt_tfidf_vectorizer.transform(df['prompt'])
24
-
25
- # save the tfidf_matrix
26
- joblib.dump(prompt_tfidf_matrix, args.output + 'prompt-tfidf_matrix.pkl')
27
-
28
- # fit the vectorizer on the completion column
29
- completion_tfidf_vectorizer = TfidfVectorizer()
30
- completion_tfidf_vectorizer.fit(df['completion'])
31
-
32
- # save the vectorizer
33
- joblib.dump(completion_tfidf_vectorizer, args.output + 'completion-vectorizer.pkl')
34
-
35
- # get the tfidf_matrix
36
- completion_tfidf_matrix = completion_tfidf_vectorizer.transform(df['completion'])
37
-
38
- # save the tfidf_matrix
39
- joblib.dump(completion_tfidf_matrix, args.output + 'completion_tfidf-matrix.pkl')
40
-
41
- print("Done!")
42
-
43
- if __name__ == '__main__':
44
- main()
45
-
46
- # example usage: python create-tfidf-matrix.py --input fine-tuning-data.parquet --output ./