Spaces:
Sleeping
Sleeping
Upload models.py
Browse files
models.py
CHANGED
@@ -1,49 +1,53 @@
|
|
1 |
import torch
|
2 |
import sentencepiece
|
3 |
-
from transformers import
|
4 |
-
|
5 |
-
import
|
|
|
|
|
|
|
|
|
6 |
class Models():
|
7 |
def __init__(self) -> None:
|
8 |
-
self.template = """
|
9 |
-
A virtual assistant answers questions from a user based on the provided text.
|
10 |
-
USER: Text: {input_text}
|
11 |
-
ASSISTANT: I’ve read this text.
|
12 |
-
USER: What describes {entity_type} in the text?
|
13 |
-
ASSISTANT:
|
14 |
-
"""
|
15 |
self.load_trained_models()
|
16 |
|
17 |
def load_trained_models(self):
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
self.prompt = PromptTemplate(template=self.template, input_variables=["input_text","entity_type"])
|
36 |
-
self.llm_chain = LLMChain(prompt=self.prompt, llm=self.llm)
|
37 |
-
|
38 |
-
def extract_ner(self, context, entity_type):
|
39 |
-
return ast.literal_eval(self.llm_chain.run({"input_text":context,"entity_type":entity_type}))
|
40 |
-
|
41 |
-
def get_ner(self, clean_lines, entity):
|
42 |
-
tokens = []
|
43 |
-
try_num = 0
|
44 |
-
while try_num < 5 and tokens == []:
|
45 |
-
tokens = self.extract_ner(' '.join(clean_lines), entity)
|
46 |
-
if len(tokens) == 0:
|
47 |
-
raise ValueError("Couldnt extract {entity}")
|
48 |
-
return tokens
|
49 |
-
|
|
|
1 |
import torch
|
2 |
import sentencepiece
|
3 |
+
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
|
4 |
+
import os
|
5 |
+
import spacy
|
6 |
+
import spacy_transformers
|
7 |
+
import zipfile
|
8 |
+
from collections import defaultdict
|
9 |
+
|
10 |
class Models():
    """Container for the NER models used by the app.

    Wraps two backends (both loaded eagerly in ``__init__``):
      - ``self.ner``: a Hugging Face token-classification pipeline
        ("Jean-Baptiste/camembert-ner-with-dates") for DATE/ORG/LOC extraction.
      - ``self.custom_ner``: a custom spaCy model loaded from disk
        (extracted from a bundled zip on first run).
    """

    def __init__(self) -> None:
        # Load both NER pipelines up front so later calls are ready to use.
        # NOTE(review): this performs model downloads / disk I/O at
        # construction time — instantiating Models is expensive by design.
        self.load_trained_models()
|
13 |
|
14 |
def load_trained_models(self):
|
15 |
+
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner-with-dates",use_fast=False)
|
16 |
+
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
|
17 |
+
self.ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
18 |
+
current_directory = os.path.dirname(os.path.realpath(__file__))
|
19 |
+
custom_ner_path = os.path.join(current_directory, 'spacy_model_v2/output/model-best')
|
20 |
+
if not os.path.exists(custom_ner_path):
|
21 |
+
with zipfile.ZipFile(r"models\prototype\spacy_model_v2.zip", 'r') as zip_ref:
|
22 |
+
# Extract all contents in the current working directory
|
23 |
+
zip_ref.extractall()
|
24 |
+
self.custom_ner = spacy.load(custom_ner_path)
|
25 |
+
|
26 |
+
def extract_ner(self, text):
|
27 |
+
entities = self.ner(text)
|
28 |
+
keys = ['DATE', 'ORG', 'LOC']
|
29 |
+
sort_dict = defaultdict(list)
|
30 |
+
for entity in entities:
|
31 |
+
if entity['score'] > 0.75:
|
32 |
+
sort_dict[entity['entity_group']].append(entity['word'])
|
33 |
+
filtered_dict = {key: value for key, value in sort_dict.items() if key in keys}
|
34 |
+
filtered_dict = defaultdict(list, filtered_dict)
|
35 |
+
return filtered_dict['DATE'], filtered_dict['ORG'], filtered_dict['LOC']
|
36 |
+
def get_ner(self, text, recover_text):
|
37 |
+
dates, companies, locations = self.extract_ner(text)
|
38 |
+
alternative_dates, alternative_companies, alternative_locations = self.extract_ner(recover_text)
|
39 |
|
40 |
+
if dates == [] :
|
41 |
+
dates = alternative_dates
|
42 |
+
if companies == []:
|
43 |
+
companies = alternative_companies
|
44 |
+
if locations == []:
|
45 |
+
locations = alternative_locations
|
46 |
+
return dates, companies, locations
|
47 |
+
def get_custom_ner(self, text):
|
48 |
+
doc = self.custom_ner(text)
|
49 |
+
entities = list(doc.ents)
|
50 |
+
sort_dict = defaultdict(list)
|
51 |
+
for entity in entities:
|
52 |
+
sort_dict[entity.label_].append(entity.text)
|
53 |
+
return sort_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|