add language
Files changed:

- .gitignore +2 -0
- Dockerfile +2 -0
- main.py +34 -3
- toolbox/allennlp/__init__.py +6 -0
- toolbox/allennlp/data/__init__.py +6 -0
- toolbox/allennlp/data/dataset_readers/__init__.py +6 -0
- toolbox/allennlp/data/dataset_readers/text_classification_json.py +95 -0
- toolbox/allennlp/training/__init__.py +6 -0
- toolbox/allennlp/training/optimizers.py +31 -0
- toolbox/os/__init__.py +6 -0
- toolbox/os/command.py +53 -0
- toolbox/os/environment.py +114 -0
- toolbox/os/other.py +9 -0
.gitignore CHANGED

@@ -3,3 +3,5 @@
 .idea/
 
 **/__pycache__/
+
+trained_models/
Dockerfile CHANGED

@@ -17,6 +17,8 @@ RUN useradd -m -u 1000 user
 # Switch to the "user" user
 USER user
 
+RUN apt-get install -y git
+
 # Set home to the user's home directory
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH
main.py CHANGED

@@ -2,9 +2,15 @@
 # -*- coding: utf-8 -*-
 import argparse
 
+from allennlp.models.archival import archive_model, load_archive
+from allennlp.predictors.text_classifier import TextClassifierPredictor
 import gradio as gr
 import platform
 
+from project_settings import project_path
+from toolbox.allennlp.data.dataset_readers.text_classification_json import TextClassificationJsonReader
+from toolbox.os.command import Command
+
 
 def get_args():
     parser = argparse.ArgumentParser()
@@ -20,10 +26,35 @@ model_names = {
 }
 
 
+trained_model_dir = project_path / "trained_models/huggingface"
+trained_model_dir.mkdir(parents=True, exist_ok=True)
+
+
 def click_button_allennlp_text_classification(text: str, model_name: str):
-
-
-
+    model_path = trained_model_dir / model_name
+    if not model_path.exists():
+        model_path.parent.mkdir(exist_ok=True)
+        Command.cd(model_path.parent.as_posix())
+        Command.popen("git clone https://huggingface.co/{}".format(model_name))
+
+    archive = load_archive(archive_file=model_path.as_posix())
+
+    predictor = TextClassifierPredictor(
+        model=archive.model,
+        dataset_reader=archive.dataset_reader,
+    )
+
+    json_dict = {
+        "sentence": text
+    }
+
+    outputs = predictor.predict_json(
+        json_dict
+    )
+    label = outputs["label"]
+    probs = outputs["probs"]
+
+    return label, round(max(probs), 4)
 
 
 def main():
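Note: main() itself is outside this hunk. For orientation, a minimal sketch of how the new handler would typically be wired into the Gradio UI follows; the Blocks layout and component names are assumptions, not part of the commit:

    # Hypothetical wiring; the real main() and model_names dict are not shown in this diff.
    def main():
        with gr.Blocks() as blocks:
            text = gr.Textbox(label="text")
            model_name = gr.Dropdown(choices=list(model_names.keys()), label="model_name")
            button = gr.Button("Run")
            label = gr.Textbox(label="label")
            prob = gr.Number(label="prob")
            button.click(
                fn=click_button_allennlp_text_classification,
                inputs=[text, model_name],
                outputs=[label, prob],  # handler returns (label, round(max(probs), 4))
            )
        blocks.launch()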
toolbox/allennlp/__init__.py ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/allennlp/data/__init__.py ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/allennlp/data/dataset_readers/__init__.py ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/allennlp/data/dataset_readers/text_classification_json.py ADDED

@@ -0,0 +1,95 @@
+from typing import Dict, List, Union
+import logging
+import json
+
+from allennlp.common.file_utils import cached_path
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+from allennlp.data.fields import LabelField, TextField, Field, ListField
+from allennlp.data.instance import Instance
+from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
+from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer
+from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
+
+logger = logging.getLogger(__name__)
+
+
+@DatasetReader.register("text_classification_json_utf8")
+class TextClassificationJsonReader(DatasetReader):
+
+    def __init__(
+        self,
+        token_indexers: Dict[str, TokenIndexer] = None,
+        tokenizer: Tokenizer = None,
+        segment_sentences: bool = False,
+        max_sequence_length: int = None,
+        skip_label_indexing: bool = False,
+        text_key: str = "text",
+        label_key: str = "label",
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
+        )
+        self._tokenizer = tokenizer or SpacyTokenizer()
+        self._segment_sentences = segment_sentences
+        self._max_sequence_length = max_sequence_length
+        self._skip_label_indexing = skip_label_indexing
+        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
+        self._text_key = text_key
+        self._label_key = label_key
+        if self._segment_sentences:
+            self._sentence_segmenter = SpacySentenceSplitter()
+
+    def _read(self, file_path):
+        with open(cached_path(file_path), "r", encoding="utf-8") as data_file:
+            for line in self.shard_iterable(data_file.readlines()):
+                if not line:
+                    continue
+                items = json.loads(line)
+                text = items[self._text_key]
+                label = items.get(self._label_key)
+                if label is not None:
+                    if self._skip_label_indexing:
+                        try:
+                            label = int(label)
+                        except ValueError:
+                            raise ValueError(
+                                "Labels must be integers if skip_label_indexing is True."
+                            )
+                    else:
+                        label = str(label)
+                yield self.text_to_instance(text=text, label=label)
+
+    def _truncate(self, tokens):
+        if len(tokens) > self._max_sequence_length:
+            tokens = tokens[: self._max_sequence_length]
+        return tokens
+
+    def text_to_instance(  # type: ignore
+        self, text: str, label: Union[str, int] = None
+    ) -> Instance:
+        fields: Dict[str, Field] = {}
+        if self._segment_sentences:
+            sentences: List[Field] = []
+            sentence_splits = self._sentence_segmenter.split_sentences(text)
+            for sentence in sentence_splits:
+                word_tokens = self._tokenizer.tokenize(sentence)
+                if self._max_sequence_length is not None:
+                    word_tokens = self._truncate(word_tokens)
+                sentences.append(TextField(word_tokens))
+            fields["tokens"] = ListField(sentences)
+        else:
+            tokens = self._tokenizer.tokenize(text)
+            if self._max_sequence_length is not None:
+                tokens = self._truncate(tokens)
+            fields["tokens"] = TextField(tokens)
+        if label is not None:
+            fields["label"] = LabelField(label, skip_indexing=self._skip_label_indexing)
+        return Instance(fields)
+
+    def apply_token_indexers(self, instance: Instance) -> None:
+        if self._segment_sentences:
+            for text_field in instance.fields["tokens"]:  # type: ignore
+                text_field._token_indexers = self._token_indexers
+        else:
+            instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore
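This reader appears to mirror AllenNLP's built-in text_classification_json reader while forcing UTF-8 when opening the data file, and it is registered as "text_classification_json_utf8" so training configs can select it by type. A minimal usage sketch, assuming a JSONL file with the default "text"/"label" keys (the file name is invented for the example):

    # data.jsonl, one JSON object per line:
    #   {"text": "this movie was great", "label": "positive"}
    #   {"text": "a waste of time", "label": "negative"}
    reader = TextClassificationJsonReader(max_sequence_length=256)
    instances = list(reader.read("data.jsonl"))
    print(instances[0])  # Instance with a "tokens" TextField and a "label" LabelField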
toolbox/allennlp/training/__init__.py ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/allennlp/training/optimizers.py ADDED

@@ -0,0 +1,31 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Any, Dict, List, Tuple
+from allennlp.training.optimizers import Registrable, Optimizer, make_parameter_groups
+from pytorch_pretrained_bert.optimization import BertAdam
+import torch
+
+
+@Optimizer.register("bert_adam")
+class BertAdamOptimizer(Optimizer, BertAdam):
+
+    def __init__(
+        self,
+        model_parameters: List[Tuple[str, torch.nn.Parameter]],
+        parameter_groups: List[Tuple[List[str], Dict[str, Any]]] = None,
+        lr: float = 5e-5,
+        warmup: float = 0.1,
+        t_total: int = 50000,
+        schedule: str = 'warmup_linear',
+    ):
+        super().__init__(
+            params=make_parameter_groups(model_parameters, parameter_groups),
+            lr=lr,
+            warmup=warmup,
+            t_total=t_total,
+            schedule=schedule,
+        )
+
+
+if __name__ == '__main__':
+    pass
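Registering the class under "bert_adam" makes it addressable from an AllenNLP training config via "optimizer": {"type": "bert_adam", ...}. A direct-construction sketch, assuming a `model` variable; the parameter values simply restate the defaults above:

    params = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = BertAdamOptimizer(
        model_parameters=params,
        lr=5e-5,
        warmup=0.1,        # fraction of t_total spent warming up
        t_total=50000,     # total optimization steps for the schedule
        schedule="warmup_linear",
    )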
toolbox/os/__init__.py ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/os/command.py ADDED

@@ -0,0 +1,53 @@
+import os
+
+
+class Command(object):
+    custom_command = [
+        'cd'
+    ]
+
+    @staticmethod
+    def _get_cmd(command):
+        command = str(command).strip()
+        if command == '':
+            return None
+        cmd_and_args = command.split(sep=' ')
+        cmd = cmd_and_args[0]
+        args = ' '.join(cmd_and_args[1:])
+        return cmd, args
+
+    @classmethod
+    def popen(cls, command):
+        cmd, args = cls._get_cmd(command)
+        if cmd in cls.custom_command:
+            method = getattr(cls, cmd)
+            return method(args)
+        else:
+            resp = os.popen(command)
+            result = resp.read()
+            resp.close()
+            return result
+
+    @classmethod
+    def cd(cls, args):
+        if args.startswith('/'):
+            os.chdir(args)
+        else:
+            pwd = os.getcwd()
+            path = os.path.join(pwd, args)
+            os.chdir(path)
+
+    @classmethod
+    def system(cls, command):
+        return os.system(command)
+
+    def __init__(self):
+        pass
+
+
+def ps_ef_grep(keyword: str):
+    cmd = 'ps -ef | grep {}'.format(keyword)
+    rows = Command.popen(cmd)
+    rows = str(rows).split('\n')
+    rows = [row for row in rows if row.__contains__(keyword) and not row.__contains__('grep')]
+    return rows
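Command.popen special-cases `cd` because os.popen runs each command in its own subshell, so a shell-level `cd` would not persist between calls; the `cd` branch instead changes the Python process's working directory with os.chdir. A short usage sketch (paths illustrative):

    Command.cd("/tmp")                   # dispatched to Command.cd, calls os.chdir
    print(Command.popen("pwd").strip())  # the next subshell now starts in /tmp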
toolbox/os/environment.py ADDED

@@ -0,0 +1,114 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import json
+import os
+
+from dotenv import load_dotenv
+from dotenv.main import DotEnv
+
+from smart.json.misc import traverse
+
+
+class EnvironmentManager(object):
+    def __init__(self, path, env, override=False):
+        filename = os.path.join(path, '{}.env'.format(env))
+        self.filename = filename
+
+        load_dotenv(
+            dotenv_path=filename,
+            override=override
+        )
+
+        self._environ = dict()
+
+    def open_dotenv(self, filename: str = None):
+        filename = filename or self.filename
+        dotenv = DotEnv(
+            dotenv_path=filename,
+            stream=None,
+            verbose=False,
+            interpolate=False,
+            override=False,
+            encoding="utf-8",
+        )
+        result = dotenv.dict()
+        return result
+
+    def get(self, key, default=None, dtype=str):
+        result = os.environ.get(key)
+        if result is None:
+            if default is None:
+                result = None
+            else:
+                result = default
+        else:
+            result = dtype(result)
+        self._environ[key] = result
+        return result
+
+
+_DEFAULT_DTYPE_MAP = {
+    'int': int,
+    'float': float,
+    'str': str,
+    'json.loads': json.loads
+}
+
+
+class JsonConfig(object):
+    """
+    Turn JSON values of the form `$float:threshold` into: look up `threshold`
+    in the environment variables, then cast the result to float.
+    """
+    def __init__(self, dtype_map: dict = None, environment: EnvironmentManager = None):
+        self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
+        self.environment = environment or os.environ
+
+    def sanitize_by_filename(self, filename: str):
+        with open(filename, 'r', encoding='utf-8') as f:
+            js = json.load(f)
+
+        return self.sanitize_by_json(js)
+
+    def sanitize_by_json(self, js):
+        js = traverse(
+            js,
+            callback=self.sanitize,
+            environment=self.environment
+        )
+        return js
+
+    def sanitize(self, string, environment):
+        """Supports environment-variable references that start with the '$' sign."""
+        if isinstance(string, str) and string.startswith('$'):
+            dtype, key = string[1:].split(':')
+            dtype = self.dtype_map[dtype]
+
+            value = environment.get(key)
+            if value is None:
+                raise AssertionError('environment not exist. key: {}'.format(key))
+
+            value = dtype(value)
+            result = value
+        else:
+            result = string
+        return result
+
+
+def demo1():
+    import json
+
+    from project_settings import project_path
+
+    environment = EnvironmentManager(
+        path=os.path.join(project_path, 'server/callbot_server/dotenv'),
+        env='dev',
+    )
+    init_scenes = environment.get(key='init_scenes', dtype=json.loads)
+    print(init_scenes)
+    print(environment._environ)
+    return
+
+
+if __name__ == '__main__':
+    demo1()
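To illustrate the `$dtype:key` convention end to end, a hedged sketch; the file names and keys here are invented for the example:

    # dotenv/dev.env:
    #   threshold=0.35
    # config.json:
    #   {"threshold": "$float:threshold"}
    environment = EnvironmentManager(path='dotenv', env='dev')
    config = JsonConfig(environment=environment)
    js = config.sanitize_by_filename('config.json')
    print(js['threshold'])  # 0.35, cast to float via _DEFAULT_DTYPE_MAP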
toolbox/os/other.py ADDED

@@ -0,0 +1,9 @@
+import os
+import inspect
+
+
+def pwd():
+    """Returns the directory of whichever file this function is called from."""
+    frame = inspect.stack()[1]
+    module = inspect.getmodule(frame[0])
+    return os.path.dirname(os.path.abspath(module.__file__))
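Because pwd() inspects the caller's stack frame, it returns the directory of the calling file regardless of the process's current working directory. A one-line sketch (the module path is invented):

    # inside /opt/app/some_module.py:
    from toolbox.os.other import pwd
    print(pwd())  # '/opt/app', even if os.getcwd() is elsewhere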