Spaces:
Build error
Build error
Duplicate from Sybghat/resume-parser
Browse files
Co-authored-by: Sybghat Ullah <[email protected]>
- .gitattributes +34 -0
- Models.py +58 -0
- README.md +14 -0
- ResumeParser.py +258 -0
- ResumeReader.py +103 -0
- ResumeSegmenter.py +264 -0
- app.py +18 -0
- main.py +23 -0
- readMe.txt +3 -0
- requirements.txt +111 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Models.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
|
2 |
+
from transformers import pipeline
|
3 |
+
from flair.data import Sentence
|
4 |
+
from flair.models import SequenceTagger
|
5 |
+
import pickle
|
6 |
+
|
7 |
+
|
8 |
+
|
9 |
+
class Models:
    """Load the NLP pipelines used by the resume parser and (un)pickle them."""

    def pickle_it(self, obj, file_name):
        """Serialize ``obj`` into ``<file_name>.pickle``."""
        target = f'{file_name}.pickle'
        with open(target, 'wb') as handle:
            pickle.dump(obj, handle)

    def unpickle_it(self, file_name):
        """Return the object stored in ``<file_name>.pickle``.

        NOTE(review): ``pickle.load`` can execute arbitrary code embedded in
        the file; only use this on pickle files you created yourself.
        """
        source = f'{file_name}.pickle'
        with open(source, 'rb') as handle:
            restored = pickle.load(handle)
        return restored

    def load_trained_models(self, pickle=False):
        """Build every pipeline from its pretrained checkpoint.

        :param pickle: when truthy, also write the loaded models to disk via
            :meth:`pickle_models`. (The name shadows the ``pickle`` module
            inside this method only; the module is not used here.)
        :return: ``(ner, ner_dates, zero_shot_classifier, tagger)``
        """
        # NER model that also recognises dates.
        dates_tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
        dates_model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
        self.ner_dates = pipeline(
            'ner',
            model=dates_model,
            tokenizer=dates_tokenizer,
            aggregation_strategy="simple",
        )

        # Zero-shot classification (distilled variant of bart-large-mnli).
        self.zero_shot_classifier = pipeline(
            "zero-shot-classification",
            model='valhalla/distilbart-mnli-12-6',
        )

        # General-purpose NER.
        ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.ner = pipeline(
            'ner',
            model=ner_model,
            tokenizer=ner_tokenizer,
            grouped_entities=True,
        )

        # Part-of-speech tagging.
        self.tagger = SequenceTagger.load("flair/pos-english-fast")

        if pickle:
            self.pickle_models()

        return self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger

    def pickle_models(self):
        """Write the four loaded models to their fixed pickle file names."""
        self.pickle_it(self.ner, "ner")
        self.pickle_it(self.zero_shot_classifier, "zero_shot_classifier_6")
        self.pickle_it(self.ner_dates, "ner_dates")
        self.pickle_it(self.tagger, "pos_tagger_fast")

    def load_pickled_models(self):
        """Restore the four models previously written by :meth:`pickle_models`.

        NOTE(review): the return order here is
        ``(ner_dates, ner, zero_shot_classifier, tagger)``, which differs from
        :meth:`load_trained_models` (``ner`` first). Confirm callers unpack
        accordingly.
        """
        ner_dates = self.unpickle_it('ner_dates')
        ner = self.unpickle_it('ner')
        zero_shot_classifier = self.unpickle_it('zero_shot_classifier_6')
        tagger = self.unpickle_it("pos_tagger_fast")
        return ner_dates, ner, zero_shot_classifier, tagger

    def get_flair_sentence(self, sent):
        """Wrap ``sent`` in a flair ``Sentence``."""
        return Sentence(sent)
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Resume Parser
|
3 |
+
emoji: 🏢
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.9.1
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: openrail
|
11 |
+
duplicated_from: Sybghat/resume-parser
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
ResumeParser.py
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from Models import Models
|
2 |
+
from ResumeSegmenter import ResumeSegmenter
|
3 |
+
from datetime import datetime
|
4 |
+
from dateutil import parser
|
5 |
+
import re
|
6 |
+
from string import punctuation
|
7 |
+
|
8 |
+
class ResumeParser:
    """Extract structured fields (name, e-mail, jobs, education, skills) from resume lines.

    The heavy lifting is delegated to the injected models: two NER pipelines,
    a zero-shot classifier and a flair POS tagger.
    """

    def __init__(self, ner, ner_dates, zero_shot_classifier, tagger):
        self.models = Models()
        self.segmenter = ResumeSegmenter(zero_shot_classifier)
        self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger = ner, ner_dates, zero_shot_classifier, tagger
        self.parsed_cv = {}

    def parse(self, resume_lines):
        """Segment ``resume_lines`` and parse every known segment.

        :param resume_lines: list of resume lines; ``None`` (e.g. an
            unsupported file type) is treated as an empty resume.
        :return: dict of the parsed fields for this resume only.
        """
        # Start from a fresh dict so fields of a previously parsed resume
        # cannot leak into this result (the parser instance is reused).
        self.parsed_cv = {}
        resume_segments = self.segmenter.segment(resume_lines or [])
        print("***************************** Parsing the Resume...***************************** ")
        for segment_name, segment in resume_segments.items():
            if segment_name == "work_and_employment":
                self.parse_job_history(segment)
            elif segment_name == "contact_info":
                self.parse_contact_info(segment)
            elif segment_name == "education_and_training":
                self.parse_education(segment)
            elif segment_name in ("skills", "skills_header"):
                # The segmenter stores skills under "skills"; the legacy name
                # "skills_header" is still accepted.
                self.parse_skills(segment)
        return self.parsed_cv

    def parse_education(self, education_and_training):
        """Store the education segment unchanged under ``'Education'``."""
        print(education_and_training)
        self.parsed_cv['Education'] = education_and_training

    def parse_skills(self, skills_header):
        """Store the skills segment unchanged under ``'Skills'``."""
        self.parsed_cv['Skills'] = skills_header

    def parse_contact_info(self, contact_info):
        """Fill ``'Name'`` and ``'Contact Info'`` from the contact lines."""
        name = self.find_person_name(contact_info)
        email = self.find_contact_email(contact_info)
        self.parsed_cv['Name'] = name
        self.parsed_cv['Contact Info'] = {"Email": email}

    def find_person_name(self, items):
        """Return the multi-word fragment most confidently classified as a person name, or ``""``."""
        candidates = []
        # Split on every punctuation character except "&".
        splitter = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
        classes = ["person name", "address", "email", "title"]
        for item in items:
            for element in splitter.split(item):
                element = ''.join(ch for ch in element.strip() if not ch.isdigit())
                # Single words are never considered a full name.
                if len(element.strip().split()) <= 1:
                    continue
                out = self.zero_shot_classifier(element, classes)
                label, score = sorted(zip(out["labels"], out["scores"]), key=lambda x: x[1])[-1]
                if label == "person name":
                    candidates.append((element, score))
        if candidates:
            return max(candidates, key=lambda x: x[1])[0]
        return ""

    def find_contact_email(self, items):
        """Return the first e-mail address found in ``items``, or ``""``."""
        for item in items:
            match = re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', item)
            if match:
                return match.group(0)
        return ""

    def parse_job_history(self, resume_segment):
        """Fill ``'Job History'`` with title, company and dates per detected job line."""
        idx_job_title = self.get_job_titles(resume_segment)
        if not idx_job_title:
            self.parsed_cv["Job History"] = []
            return
        # When the first title is the very first line, company/date context is
        # looked up on the title line and below; otherwise one line above.
        current_and_below = idx_job_title[0][0] == 0
        job_history = []
        for ls_idx, (idx, job_title) in enumerate(idx_job_title):
            job_info = {}
            job_info["Job Title"] = self.filter_job_title(job_title)
            # Company
            neighbour = idx + 1 if current_and_below else idx - 1
            job_info["Company"] = self.get_job_company(idx, neighbour, resume_segment)
            # Dates: search from this job up to the next detected title.
            st_span = idx if current_and_below else idx - 1
            if ls_idx == len(idx_job_title) - 1:
                end_span = len(resume_segment)
            else:
                end_span = idx_job_title[ls_idx + 1][0]
            start, end = self.get_job_dates(st_span, end_span, resume_segment)
            job_info["Start Date"] = start
            job_info["End Date"] = end
            job_history.append(job_info)
        self.parsed_cv["Job History"] = job_history

    def get_job_titles(self, resume_segment):
        """Return ``(index, line)`` for lines that look like a job title or organization.

        A line qualifies when its most frequent POS tag is NN/NNP, it holds no
        verb, and the zero-shot classifier ranks "job title" or "organization"
        first.
        """
        classes = ["organization", "institution", "company", "job title", "work details"]
        idx_line = []
        for idx, line in enumerate(resume_segment):
            line_modifed = ''.join(ch for ch in line if not ch.isdigit())
            sentence = self.models.get_flair_sentence(line_modifed)
            self.tagger.predict(sentence)
            tags = [entity.tag for entity in sentence.get_spans('pos')]
            if not tags:
                # Nothing was tagged (e.g. empty or digits-only line); max()
                # below would raise on an empty collection.
                continue
            if any(tag.startswith("V") for tag in tags):
                continue
            most_common_tag = max(set(tags), key=tags.count)
            if most_common_tag not in ("NNP", "NN"):
                continue
            out = self.zero_shot_classifier(line, classes)
            highest = sorted(zip(out["labels"], out["scores"]), key=lambda x: x[1])[-1]
            if highest[0] in ("job title", "organization"):
                idx_line.append((idx, line))
        return idx_line

    def get_job_dates(self, st, end, resume_segment):
        """Return ``(start, end)`` date strings found in ``resume_segment[st:end]``.

        Missing dates are returned as ``""``.
        """
        dates = []
        for line in resume_segment[st:end]:
            for dt in self.get_ner_in_line(line, "DATE"):
                if self.isvalidyear(dt.strip()):
                    dates.append(dt)
        if not dates:
            return ("", "")

        first = dates[0]
        second = dates[1] if len(dates) > 1 else None

        # A single entity such as "2015 - 2018" carries both dates.
        if self.has_two_dates(first):
            d1, d2 = self.get_two_dates(first)
            return self.format_date(d1), self.format_date(d2)
        if second is not None and self.has_two_dates(second):
            d1, d2 = self.get_two_dates(second)
            return self.format_date(d1), self.format_date(d2)
        if second is not None:
            return self.format_date(first), self.format_date(second)
        return (self.format_date(first), "")

    def filter_job_title(self, job_title):
        """Keep only the fragments of ``job_title`` classified as job title/organization.

        Falls back to all cleaned fragments when none qualifies.
        """
        job_title_splitter = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
        job_title = ''.join(ch for ch in job_title if not ch.isdigit())
        tokens = job_title_splitter.split(job_title)
        # Keep letters and whitespace only.
        tokens = [''.join([ch for ch in tok.strip() if (ch.isalpha() or ch.strip() == "")]) for tok in tokens if tok.strip()]
        classes = ["company", "organization", "institution", "job title", "responsibility", "details"]
        new_title = []
        for token in tokens:
            if not token:
                continue
            res = self.zero_shot_classifier(token, classes)
            highest = sorted(zip(res["labels"], res["scores"]), key=lambda x: x[1])[-1]
            if highest[0] in ("job title", "organization"):
                new_title.append(token.strip())
        if new_title:
            return ', '.join(new_title)
        return ', '.join(tokens)

    def has_two_dates(self, date):
        """Return True when exactly two distinct valid years occur in ``date``."""
        text = str(date)
        return sum(1 for year in self.get_valid_years() if year in text) == 2

    def get_two_dates(self, date):
        """Split ``date`` right after its earliest-positioned year.

        :return: ``(first, second)``; ``(date, "")`` when no year is present.
        """
        idxs = [date.index(year) for year in self.get_valid_years() if year in date]
        if not idxs:
            return date, ""
        cut = min(idxs) + 4  # a year is four characters long
        return date[:cut], date[cut:]

    def get_valid_years(self):
        """Return the last 100 years up to and including the current year, as strings."""
        current_year = datetime.today().year
        # +1 so that jobs dated in the current year are not rejected.
        return [str(i) for i in range(current_year - 100, current_year + 1)]

    def format_date(self, date):
        """Return ``date`` as ``MM-YYYY`` when parseable, else the cleaned text."""
        out = self.parse_date(date)
        if out:
            return out
        date = self.clean_date(date)
        out = self.parse_date(date)
        if out:
            return out
        return date

    def clean_date(self, date):
        """Drop every character except alphanumerics, ``-`` and ``/``."""
        try:
            return ''.join(ch for ch in date if ch.isalnum() or ch == '-' or ch == '/')
        except TypeError:
            # Not iterable: hand the value back untouched.
            return date

    def parse_date(self, date):
        """Return ``date`` formatted as ``MM-YYYY``, or ``0`` when it cannot be parsed."""
        try:
            return parser.parse(date).strftime("%m-%Y")
        except Exception:
            # Best-effort parsing: any failure means "not a date".
            return 0

    def isvalidyear(self, date):
        """Return True when ``date`` contains one of the valid years."""
        text = str(date)
        return any(year in text for year in self.get_valid_years())

    def get_ner_in_line(self, line, entity_type):
        """Return the words of all ``entity_type`` entities found in ``line``.

        ``"DATE"`` uses the date-aware NER pipeline, anything else the general one.
        """
        if entity_type == "DATE":
            ner = self.ner_dates
        else:
            ner = self.ner
        return [i['word'] for i in ner(line) if i['entity_group'] == entity_type]

    def get_job_company(self, idx, idx1, resume_segment):
        """Return the most company-like ORG entity on lines ``idx`` and ``idx1``.

        Falls back to the text of line ``idx1`` (or ``""`` when it is out of
        range) if no ORG entity is found.
        """
        job_title = resume_segment[idx]
        if 0 <= idx1 <= len(resume_segment) - 1:
            context = resume_segment[idx1]
        else:
            context = ""
        candidate_companies = self.get_ner_in_line(job_title, "ORG") + self.get_ner_in_line(context, "ORG")
        classes = ["organization", "company", "institution", "not organization", "not company", "not institution"]
        positive = classes[:3]
        scores = []
        for comp in candidate_companies:
            res = self.zero_shot_classifier(comp, classes)
            # The pipeline returns labels ordered by score, not in the order of
            # `classes`, so look the positive labels up by name.
            by_label = dict(zip(res['labels'], res['scores']))
            scores.append(max(by_label[label] for label in positive))
        sorted_cmps = sorted(zip(candidate_companies, scores), key=lambda x: x[1], reverse=True)
        if sorted_cmps:
            return sorted_cmps[0][0]
        return context
|
ResumeReader.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import os
|
3 |
+
import logging
|
4 |
+
import pdfplumber
|
5 |
+
import fitz
|
6 |
+
|
7 |
+
class ResumeReader:
    """Read a resume file from disk and return its text as a list of clean lines."""

    def _clean_lines(self, text):
        """Split ``text`` into lines, collapse whitespace runs and drop blank lines."""
        text = re.sub(r'\n+', '\n', text)
        text = text.replace("\r", "\n").replace("\t", " ")
        return [re.sub(r'\s+', ' ', line.strip()) for line in text.splitlines() if line.strip()]

    def convert_docx_to_txt(self, docx_file, docx_parser):
        """
        Placeholder for converting a Microsoft docx file to raw text.

        Text extraction is currently disabled (no docx backend is wired in),
        so this always yields an empty result.

        :param docx_file: path of the docx file uploaded by the user
        :param docx_parser: name of the requested docx backend (unused)
        :return: ``(resume_lines, raw_text)``, currently ``([], "")``
        :rtype: tuple
        """
        logging.warning('docx/doc extraction is not implemented; returning no text for %s', docx_file)
        text = ""
        try:
            return self._clean_lines(text), text
        except Exception as e:
            logging.error('Error in docx file:: ' + str(e))
            return [], " "

    def convert_pdf_to_txt(self, pdf_file):
        """
        Convert a machine-readable PDF to raw text using PyMuPDF (``fitz``).

        :param pdf_file: path to the .pdf file which should be converted
        :type pdf_file: str
        :return: ``(resume_lines, raw_text)``; ``([], " ")`` if cleaning fails
        :rtype: tuple
        """
        with fitz.open(pdf_file) as doc:
            raw_text = "".join(page.get_text() for page in doc)
        print(raw_text)

        try:
            full_string = raw_text
            # Remove bullet glyphs and unresolved "(cid:NNN)" font references
            # left behind by PDF extraction.
            full_string = re.sub(r"\uf0b7", " ", full_string)
            full_string = re.sub(r"\(cid:\d{0,3}\)", " ", full_string)
            full_string = re.sub(r'• ', " ", full_string)
            return self._clean_lines(full_string), raw_text
        except Exception as e:
            logging.error('Error in pdf file:: ' + str(e))
            return [], " "

    def read_file(self, file, docx_parser="tika"):
        """
        Read a resume and return its cleaned lines.

        The file type is chosen from the (case-insensitive) file name ending.

        file : Give path of resume file
        docx_parser : Enter docx2txt or tika, by default is tika
        :return: list of lines, or ``None`` for an unsupported file type
        """
        print("Reading the Resume...")
        file = os.fspath(file)
        # Compare case-insensitively so "CV.PDF" / "CV.TXT" are accepted too.
        lowered = file.lower()
        if lowered.endswith(('docx', 'doc')):
            resume_lines, raw_text = self.convert_docx_to_txt(file, docx_parser)
        elif lowered.endswith('pdf'):
            resume_lines, raw_text = self.convert_pdf_to_txt(file)
        elif lowered.endswith('txt'):
            with open(file, 'r', encoding='utf-8') as f:
                # Normalize exactly like the PDF path: no trailing newlines,
                # no blank lines.
                resume_lines = self._clean_lines(f.read())
        else:
            logging.warning('Unsupported resume file type: %s', file)
            resume_lines = None

        return resume_lines
|
ResumeSegmenter.py
ADDED
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from Models import Models
|
2 |
+
|
3 |
+
class ResumeSegmenter:
    """Split resume lines into named segments based on section-header keywords."""

    def __init__(self, zero_shot_classifier):
        self.zero_shot_classifier = zero_shot_classifier

    # Header keywords per segment. A line opens a segment when its lower-cased
    # text starts with one of these keywords (compared case-insensitively).
    objective = (
        'career goal',
        'objective',
        'career objective',
        'employment objective',
        'professional objective',
        'summary',
        'summary of qualifications',
        'digital',
        'interests'
    )

    work_and_employment = (
        'employment history',
        'employment data',
        'career summary',
        'work history',
        'working history',
        'work experience',
        'experience',
        'professional experience',
        'professional background',
        'professional employment',
        'additional experience',
        'career related experience',
        "professional employment history",
        'related experience',
        'relevant experience',
        'programming experience',
        'freelance',
        'freelance experience',
        'army experience',
        'military experience',
        'military background',
    )

    education_and_training = (
        'academic background',
        'academic experience',
        'programs',
        'courses',
        'related courses',
        'education',
        'educational background',
        'educational qualifications',
        'educational training',
        'education and training',
        'training',
        'academic training',
        'Academic Qualification',
        'professional training',
        'course project experience',
        'related course projects',
        'internship experience',
        'internships',
        'apprenticeships',
        'college activities',
        'certifications',
        'special training',
    )

    skills_header = (
        'credentials',
        'qualifications',
        'areas of experience',
        'areas of expertise',
        'areas of knowledge',
        'skills',
        'Skills',
        "other skills",
        "other abilities",
        'career related skills',
        'professional skills',
        'specialized skills',
        'technical skills',
        'computer skills',
        'personal skills',
        'computer knowledge',
        'technologies',
        'technical experience',
        'proficiencies',
        'languages',
        'language competencies and skills',
        'programming languages',
        'competencies'
    )

    misc = (
        'activities and honors',
        'activities',
        'affiliations',
        'professional affiliations',
        'associations',
        'professional associations',
        'memberships',
        'professional memberships',
        'athletic involvement',
        'community involvement',
        'refere',
        'civic activities',
        'extra-Curricular activities',
        'professional activities',
        'volunteer work',
        'volunteer experience',
        'additional information',
        'interests'
    )

    accomplishments = (
        'achievement',
        'awards and achievements',
        'licenses',
        'presentations',
        'conference presentations',
        'conventions',
        'dissertations',
        'exhibits',
        'papers',
        'publications',
        'professional publications',
        'research experience',
        'research grants',
        'project',
        'research projects',
        'personal projects',
        'current research interests',
        'thesis',
        'theses',
    )

    def find_segment_indices(self, string_to_search, resume_segments, resume_indices):
        """Record which lines are section headers.

        For every header line, its index is appended to ``resume_indices`` and
        stored in ``resume_segments[segment][keyword]``. Both arguments are
        mutated in place. Lines that are empty or start with a lower-case
        character are never treated as headers.
        """
        # Checked in this order; the first segment with a matching keyword wins.
        keyword_groups = (
            ('objective', self.objective),
            ('work_and_employment', self.work_and_employment),
            ('education_and_training', self.education_and_training),
            ('skills', self.skills_header),
            ('misc', self.misc),
            ('accomplishments', self.accomplishments),
        )
        for i, line in enumerate(string_to_search):
            if not line or line[0].islower():
                continue

            header = line.lower()
            for segment_name, keywords in keyword_groups:
                # Keywords are lower-cased too, so mixed-case entries such as
                # 'Academic Qualification' can match the lower-cased header.
                matches = [k for k in keywords if header.startswith(k.lower())]
                if not matches:
                    continue
                # A line identical to an already recorded keyword is ignored.
                if header not in resume_segments[segment_name]:
                    resume_indices.append(i)
                    resume_segments[segment_name][matches[0]] = i
                break

    def slice_segments(self, string_to_search, resume_segments, resume_indices):
        """Replace each recorded header index with the lines of that section.

        Everything before the first header becomes ``'contact_info'``. A
        section runs from its header line up to the next header (or the end).

        :return: dict mapping section name to its last ``(start, end)`` span.
        """
        resume_segments['contact_info'] = string_to_search[:resume_indices[0]]
        sec_idxs = {}
        for section, value in resume_segments.items():
            if section == 'contact_info':
                continue

            for sub_section, start_idx in list(value.items()):
                position = resume_indices.index(start_idx)
                if position + 1 != len(resume_indices):
                    end_idx = resume_indices[position + 1]
                else:
                    end_idx = len(string_to_search)

                sec_idxs[section] = (start_idx, end_idx)
                resume_segments[section][sub_section] = string_to_search[start_idx:end_idx]
        return sec_idxs

    def find_true_segment(self, dict_of_segments, segment_name):
        """Return the key of the sub-section that best matches ``segment_name``.

        Each candidate is scored with the zero-shot classifier against the
        segment's label and "other". Returns ``0`` when there are no candidates.
        """
        segment_classes = {
            'objective': ["objective", "other"],
            'work_and_employment': ["employment history", "other"],
            'education_and_training': ["education", "other"],
            'skills': ["skills", "other"],
            'accomplishments': ["accomplishments", "other"],
            'misc': ["misc", "other"],
            'contact_info': ["contact information", "other"]
        }
        classes = segment_classes[segment_name]
        target_label = classes[0]
        scores = []
        for sequence in dict_of_segments.values():
            out = self.zero_shot_classifier(' '.join(sequence), classes)
            # Results are ordered by score, so scores[0] may belong to
            # "other"; pick the target label's score explicitly.
            scores.append(dict(zip(out["labels"], out["scores"]))[target_label])

        res = sorted(zip(dict_of_segments.keys(), scores), key=lambda x: x[1], reverse=True)
        if res:
            return res[0][0]
        return 0

    def segment(self, string_to_search):
        """Split the resume lines into segments.

        :param string_to_search: list of resume lines (``None`` is treated as empty).
        :return: dict keyed by segment name. A segment with exactly one
            sub-section becomes the list of its lines without the header line;
            ``work_and_employment`` with several sub-sections is reduced to
            the best-matching one; other segments are left as produced by
            :meth:`slice_segments` (empty dict when absent).
        """
        print("Segmenting the Resume..")
        if string_to_search is None:
            string_to_search = []
        resume_segments = {
            'objective': {},
            'work_and_employment': {},
            'education_and_training': {},
            'skills': {},
            'accomplishments': {},
            'misc': {}
        }

        resume_indices = []

        self.find_segment_indices(string_to_search, resume_segments, resume_indices)
        if resume_indices:
            self.slice_segments(string_to_search, resume_segments, resume_indices)
        else:
            resume_segments['contact_info'] = []

        for segment in resume_segments:
            if segment == "contact_info":
                continue
            sub_sections = resume_segments[segment]
            if len(sub_sections) <= 1:
                if len(sub_sections) == 1:
                    only_key = list(sub_sections.keys())[0]
                    # [1:] drops the header line itself.
                    resume_segments[segment] = sub_sections[only_key][1:]
                continue
            if segment != "work_and_employment":
                continue
            true_seg = self.find_true_segment(sub_sections, segment)
            if not true_seg:
                resume_segments[segment] = []
            else:
                resume_segments[segment] = sub_sections[true_seg][1:]

        return resume_segments
|
app.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydoc import describe
|
2 |
+
import gradio as gr
|
3 |
+
from main import Main
|
4 |
+
|
5 |
+
|
6 |
+
# Build the pipeline once at import time: Main() loads the trained models in
# its constructor, so every request handled by parse_cv reuses this instance.
main = Main()
|
7 |
+
|
8 |
+
def parse_cv(cv):
    """Run the shared parser on an uploaded CV.

    Args:
        cv: Uploaded file object; only its ``name`` attribute is read and
            handed to the parser as the file path.

    Returns:
        Whatever ``main.parse_cv`` returns for that path.
    """
    uploaded_path = cv.name
    return main.parse_cv(uploaded_path)
|
10 |
+
|
11 |
+
|
12 |
+
# Texts passed to the interface below.
description = """A demo for a CV parser."""
article = "Resume Parser by Sybghat"

# Single-file upload input; the label advertises PDF and TXT resumes.
file_input = gr.inputs.File(
    file_count="single",
    type="file",
    label="Upload a CV: .PDF Or .TXT",
    optional=False,
)

# The parsed result is shown as JSON; flagging and screenshots are turned off.
iface = gr.Interface(
    fn=parse_cv,
    inputs=file_input,
    outputs="json",
    allow_flagging="never",
    allow_screenshot=False,
    title="CV Parser",
    theme="seafoam",
    description=description,
    article=article,
)

iface.launch()
|
main.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ResumeReader import ResumeReader
|
2 |
+
from ResumeParser import ResumeParser
|
3 |
+
from Models import Models
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
|
7 |
+
|
8 |
+
class Main:
    """Entry point that wires the resume reader and the resume parser together."""

    def __init__(self):
        """Load the trained models and create the reader/parser pair."""
        trained = Models().load_trained_models()
        ner, ner_dates, zero_shot_classifier, tagger = trained
        self.reader = ResumeReader()
        self.parser = ResumeParser(ner, ner_dates, zero_shot_classifier, tagger)

    def parse_cv(self, file_path):
        """Read the file at ``file_path`` and return the parser's output for it."""
        return self.parser.parse(self.reader.read_file(file_path))

    def save_parse_as_json(self, dict, file_name):
        """Write a parse result to ``file_name`` as indented UTF-8 JSON.

        Args:
            dict: The parse result to serialise. The name shadows the builtin
                but is kept because it is part of the public signature.
            file_name: Destination path; an existing file is overwritten.
        """
        print("Saving the parse...")
        # default=str stringifies values json cannot encode natively;
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        dump_options = {"indent": 4, "default": str, "ensure_ascii": False}
        with open(file_name, mode='w', encoding="utf-8") as out_file:
            json.dump(dict, out_file, **dump_options)
|
readMe.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
Activate the virtual environment:
|
2 |
+
resume-parser/Scripts/activate.bat // CMD
|
3 |
+
resume-parser/Scripts/activate.ps1 // PowerShell
|
requirements.txt
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pip==22.3.1
|
2 |
+
aiohttp==3.8.1
|
3 |
+
aiosignal==1.2.0
|
4 |
+
analytics-python==1.4.0
|
5 |
+
anyio==3.5.0
|
6 |
+
asgiref==3.5.0
|
7 |
+
async-timeout==4.0.2
|
8 |
+
attrs==21.4.0
|
9 |
+
backoff==1.10.0
|
10 |
+
bcrypt==3.2.0
|
11 |
+
bpemb==0.3.3
|
12 |
+
certifi==2021.10.8
|
13 |
+
cffi==1.15.0
|
14 |
+
chardet==4.0.0
|
15 |
+
charset-normalizer==2.0.11
|
16 |
+
click==8.0.3
|
17 |
+
colorama==0.4.4
|
18 |
+
coloredlogs==15.0.1
|
19 |
+
conllu==4.4.1
|
20 |
+
cryptography==36.0.1
|
21 |
+
cycler==0.11.0
|
22 |
+
Cython==0.29.23
|
23 |
+
Deprecated==1.2.13
|
24 |
+
doc2text==0.2.4
|
25 |
+
fastapi==0.73.0
|
26 |
+
ffmpy==0.3.0
|
27 |
+
filelock==3.4.2
|
28 |
+
flair==0.10
|
29 |
+
flatbuffers==2.0
|
30 |
+
fonttools==4.29.1
|
31 |
+
frozenlist==1.3.0
|
32 |
+
ftfy==6.0.3
|
33 |
+
future==0.18.2
|
34 |
+
gdown==3.12.2
|
35 |
+
gensim==4.1.2
|
36 |
+
gradio==2.7.5.2
|
37 |
+
h11==0.13.0
|
38 |
+
huggingface-hub==0.4.0
|
39 |
+
humanfriendly==10.0
|
40 |
+
idna==3.3
|
41 |
+
importlib-metadata==3.10.1
|
42 |
+
Janome==0.4.1
|
43 |
+
Jinja2==3.0.3
|
44 |
+
joblib==1.1.0
|
45 |
+
kiwisolver==1.3.2
|
46 |
+
konoha==4.6.5
|
47 |
+
langdetect==1.0.9
|
48 |
+
markdown2==2.4.2
|
49 |
+
MarkupSafe==2.0.1
|
50 |
+
matplotlib==3.5.1
|
51 |
+
mime==0.1.0
|
52 |
+
monotonic==1.6
|
53 |
+
more-itertools==8.8.0
|
54 |
+
mpld3==0.3
|
55 |
+
multidict==6.0.2
|
56 |
+
numpy==1.22.1
|
57 |
+
overrides==3.1.0
|
58 |
+
packaging==21.3
|
59 |
+
pandas==1.4.0
|
60 |
+
paramiko==2.9.2
|
61 |
+
pdfminer.six==20211012
|
62 |
+
pdfplumber==0.6.0
|
63 |
+
Pillow==9.0.1
|
64 |
+
protobuf==3.19.4
|
65 |
+
psutil==5.9.0
|
66 |
+
py-cpuinfo==8.0.0
|
67 |
+
py3nvml==0.2.7
|
68 |
+
pycparser==2.21
|
69 |
+
pycryptodome==3.14.1
|
70 |
+
pydantic==1.9.0
|
71 |
+
pydub==0.25.1
|
72 |
+
PyNaCl==1.5.0
|
73 |
+
pyparsing==3.0.7
|
74 |
+
PyPDF2==1.26.0
|
75 |
+
pyreadline3==3.4.1
|
76 |
+
PySocks==1.7.1
|
77 |
+
pytesseract==0.3.8
|
78 |
+
python-dateutil==2.8.2
|
79 |
+
python-multipart==0.0.5
|
80 |
+
pytz==2021.3
|
81 |
+
PyYAML==6.0
|
82 |
+
regex==2022.1.18
|
83 |
+
requests==2.27.1
|
84 |
+
sacremoses==0.0.47
|
85 |
+
scikit-learn==1.0.2
|
86 |
+
scipy==1.7.3
|
87 |
+
segtok==1.5.11
|
88 |
+
sentencepiece==0.1.95
|
89 |
+
six==1.16.0
|
90 |
+
smart-open==5.2.1
|
91 |
+
sniffio==1.2.0
|
92 |
+
sqlitedict==1.7.0
|
93 |
+
starlette==0.17.1
|
94 |
+
tabulate==0.8.9
|
95 |
+
threadpoolctl==3.1.0
|
96 |
+
tokenizers==0.10.3
|
97 |
+
torch==1.10.2
|
98 |
+
tqdm==4.62.3
|
99 |
+
transformers==4.15.0
|
100 |
+
typing_extensions==4.0.1
|
101 |
+
urllib3==1.26.8
|
102 |
+
uvicorn==0.17.4
|
103 |
+
Wand==0.6.7
|
104 |
+
wcwidth==0.2.5
|
105 |
+
Wikipedia-API==0.5.4
|
106 |
+
wincertstore==0.2
|
107 |
+
wrapt==1.13.3
|
108 |
+
xmltodict==0.12.0
|
109 |
+
yarl==1.7.2
|
110 |
+
zipp==3.7.0
|
111 |
+
PyMuPDF==1.19.0
|