Roaoch committed · feeb971 · Parent: 7e97035

From Deprecated
Browse files:
- .github/workflows/main.yml +18 -0
- Dockerfile +16 -0
- main.py +22 -0
- requirements.txt +0 -0
- src/cyberclaasic.py +71 -0
- src/discriminator.py +28 -0
- src/utils/proccess_data.py +14 -0
- startings.csv +0 -0
.github/workflows/main.yml
ADDED
@@ -0,0 +1,18 @@
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  workflow_dispatch:
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push --force https://Roaoch:[email protected]/spaces/Roaoch/CyberClassic main
Dockerfile
ADDED
@@ -0,0 +1,16 @@
+FROM python:3.9
+
+RUN useradd -m -u 1000 user
+
+WORKDIR /app
+
+COPY ./startings.csv ./startings.csv
+COPY ./src ./src
+COPY ./requirements.txt ./requirements.txt
+COPY ./main.py ./main.py
+
+RUN pip install --upgrade pip
+RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+RUN pip install pandas numpy transformers fastapi uvicorn[standard]
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py
ADDED
@@ -0,0 +1,22 @@
+import warnings
+
+from src.cyberclaasic import CyberClassic
+from fastapi import FastAPI
+
+warnings.simplefilter("ignore", UserWarning)
+
+app = FastAPI()
+
+text_generator = CyberClassic(
+    min_length=30,
+    max_length=50,
+    startings_path='./startings.csv'
+)
+
+@app.get("/")
+def generate():
+    return {"text": str(text_generator.generate())}
+
+@app.get('/answer')
+def answer(prompt: str):
+    return {"text": str(text_generator.answer(f'{prompt}:\n'))}
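Once the container is running, both routes answer plain GET requests. A minimal smoke test with `requests` (a sketch: the base URL assumes the local container above, and the prompt string is an arbitrary example):

import requests

BASE = 'http://localhost:7860'  # port exposed by the Dockerfile CMD above

# GET / returns a passage seeded from random rows of startings.csv
print(requests.get(f'{BASE}/').json()['text'])

# GET /answer continues a caller-supplied prompt
print(requests.get(f'{BASE}/answer', params={'prompt': 'Зимний вечер'}).json()['text'])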
requirements.txt
ADDED
File without changes
src/cyberclaasic.py
ADDED
@@ -0,0 +1,71 @@
+import uuid
+import torch
+import json
+
+import pandas as pd
+
+from src.discriminator import DiscriminatorModel
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2LMHeadModel, GenerationConfig
+
+import numpy as np
+
+class CyberClassic(torch.nn.Module):
+    def __init__(
+        self,
+        min_length: int,
+        max_length: int,
+        startings_path: str
+    ) -> None:
+        super().__init__()
+        self.min_length = min_length
+        self.max_length = max_length
+        self.startings = pd.read_csv(startings_path)
+
+        self.tokenizer = AutoTokenizer.from_pretrained('Roaoch/CyberClassic-Generator')
+        self.generator: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained('Roaoch/CyberClassic-Generator')
+        self.discriminator = DiscriminatorModel.from_pretrained('Roaoch/CyberClassic-Discriminator')
+
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.generation_config = GenerationConfig(
+            max_new_tokens=max_length,
+            num_beams=6,
+            early_stopping=True,
+            do_sample=True,
+            # top_k=60,
+            # penalty_alpha=0.6,
+            # top_p=0.95,
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id
+        )
+
+    def encode(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        last_hidden_state = self.generator(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)['hidden_states'][-1]
+        weights_for_non_padding = attention_mask * torch.arange(start=1, end=last_hidden_state.shape[1] + 1).unsqueeze(0)
+        sum_embeddings = torch.sum(last_hidden_state * weights_for_non_padding.unsqueeze(-1), dim=1)
+        num_of_none_padding_tokens = torch.sum(weights_for_non_padding, dim=-1).unsqueeze(-1)
+        return sum_embeddings / num_of_none_padding_tokens
+
+    def generate(self) -> str:
+        starts = self.startings['text'].values[np.random.randint(0, len(self.startings), 4)].tolist()
+        tokens = self.tokenizer(starts, return_tensors='pt', padding=True, truncation=True)
+        generated = self.generator.generate(**tokens, generation_config=self.generation_config)
+
+        input_emb = self.encode(input_ids=generated, attention_mask=torch.full(generated.size(), 1))
+        score = self.discriminator(input_emb)
+        score = torch.abs(score - 0.889)
+        index = int(torch.argmin(score))
+
+        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
+
+        return decoded[index]
+
+    def answer(self, prompt: str) -> str:
+        prompt_tokens = self.tokenizer(prompt, return_tensors='pt')
+        output = self.generator.generate(
+            **prompt_tokens,
+            generation_config=self.generation_config,
+        )
+
+        decoded = self.tokenizer.batch_decode(output)
+        return decoded[0]
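The selection step in `generate` is effectively best-of-4 sampling: four candidates are generated from random starting phrases, embedded via `encode`, scored by the discriminator, and the candidate whose score lies closest to the hard-coded target 0.889 is returned. A standalone sketch of that argmin-by-distance step (the scores are made-up illustrative values):

import torch

scores = torch.tensor([0.42, 0.91, 0.88, 0.15])  # hypothetical discriminator outputs for 4 candidates
target = 0.889                                   # target score hard-coded in CyberClassic.generate
best = int(torch.argmin(torch.abs(scores - target)))
print(best)  # -> 2, since |0.88 - 0.889| is the smallest gap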
src/discriminator.py
ADDED
@@ -0,0 +1,28 @@
+import torch
+
+from transformers import PretrainedConfig, PreTrainedModel
+
+class DiscriminatorModelConfig(PretrainedConfig):
+    model_type = 'descriminatormodel'
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+class DiscriminatorModel(PreTrainedModel):
+    config_class = DiscriminatorModelConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.model = torch.nn.Sequential(
+            torch.nn.Linear(768, 512),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.1),
+            torch.nn.Linear(512, 256),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.1),
+            torch.nn.Linear(256, 1),
+            torch.nn.Dropout(0.1),
+            torch.nn.Sigmoid()
+        )
+    def forward(self, input):
+        return self.model(input)
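Since `DiscriminatorModel` subclasses `PreTrainedModel`, it loads from the Hub the same way the generator does and maps a 768-dimensional embedding to a single sigmoid score in (0, 1). A quick smoke test with a random stand-in embedding (illustrative only):

import torch
from src.discriminator import DiscriminatorModel

model = DiscriminatorModel.from_pretrained('Roaoch/CyberClassic-Discriminator')
model.eval()  # disable the dropout layers for inference

embedding = torch.randn(1, 768)  # stand-in for a sentence embedding from CyberClassic.encode
with torch.no_grad():
    score = model(embedding)
print(score.shape, float(score))  # torch.Size([1, 1]) and a value in (0, 1)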
src/utils/proccess_data.py
ADDED
@@ -0,0 +1,14 @@
+import pandas as pd
+
+df = pd.read_csv('dataset.csv')['text'].values
+res = [
+    ' '.join(txt.split(' ')[:3])
+    for txt in df
+]
+
+res_df = pd.DataFrame({
+    'text':
+    res
+})
+
+res_df.to_csv('startings.csv', index=False)
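The script truncates every row of `dataset.csv` to its first three space-separated words; those fragments are the starting phrases that `CyberClassic.generate` later samples from. For a hypothetical input row:

txt = 'Я помню чудное мгновенье'      # made-up example row
print(' '.join(txt.split(' ')[:3]))  # -> 'Я помню чудное'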
startings.csv
ADDED
The diff for this file is too large to render. See raw diff.