import gc
import torch
from nltk import sent_tokenize
import nltk
from tqdm import tqdm
import gradio as gr
from peft import PeftModel
from transformers import T5ForConditionalGeneration, T5Tokenizer

nltk.download("punkt")
# Pick the compute device: use the GPU at GPU_IDX if CUDA is available, otherwise fall back to CPU
GPU_IDX = 1  # index of the GPU to use
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
    device = torch.device(f"cuda:{GPU_IDX}")
    print(f"Using GPU: {GPU_IDX}")
else:
    print("CUDA is not available. Using CPU instead.")
    device = torch.device("cpu")

batch_size = 64

# Configuration for models and their adapters
model_config = {
    "Base Model": "polygraf-ai/poly-humanizer-base",
    "Large Model": "polygraf-ai/poly-humanizer-large",
    "XL Model": "polygraf-ai/poly-humanizer-XL-adapter",
}

# Load every model and its tokenizer up front and keep them cached in memory.
# The XL model is a PEFT (LoRA) adapter that is merged into the flan-t5-xl base weights.
models, tokenizers = {}, {}
for name, path in model_config.items():
    if name == "XL Model":
        model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", torch_dtype=torch.bfloat16).to(device)
        model = PeftModel.from_pretrained(model, path, torch_dtype=torch.bfloat16, is_trainable=False)
        model = model.merge_and_unload()
        models[name] = model
        tokenizers[name] = T5Tokenizer.from_pretrained("google/flan-t5-xl")
    else:
        model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
        models[name] = model
        tokenizers[name] = T5Tokenizer.from_pretrained(path)
    print(f"Loaded model: {name}, Num. params: {model.num_parameters()}")


def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
    inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
    inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=128,
        top_k=top_k,
        length_penalty=length_penalty,
    )
    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return answers


def paraphrase_text(
    text,
    progress=gr.Progress(),
    model_name="Base Model",
    temperature=1.2,
    repetition_penalty=1.0,
    top_k=50,
    length_penalty=1.0,
):
    """
    Optimization here is to feed all sentences at once to the model.
    Paragraphs are stored as a number of sentences per paragraph.
    """
    progress(0, desc="Starting to Humanize")
    # Select the requested model and its tokenizer (adapters were already merged at load time)
    tokenizer = tokenizers[model_name]
    model = models[model_name].to(device)

    # Split the text into paragraphs and then into sentences
    paragraphs = text.split("\n")
    all_sentences = []
    sentences_per_paragraph = []

    for paragraph in paragraphs:
        sentences = sent_tokenize(paragraph)
        sentences_per_paragraph.append(len(sentences))
        all_sentences.extend(sentences)

    # Process all sentences in batches
    paraphrased_sentences = []
    for i in progress.tqdm(range(0, len(all_sentences), batch_size)):
        batch_sentences = all_sentences[i : i + batch_size]
        paraphrased_batch = paraphrase_sentences(
            model, tokenizer, batch_sentences, temperature, repetition_penalty, top_k, length_penalty
        )
        paraphrased_sentences.extend(paraphrased_batch)

        # Release cached GPU memory and run garbage collection between batches
        torch.cuda.empty_cache()
        gc.collect()

    # Reconstruct paragraphs
    humanized_paragraphs = []
    sentence_index = 0
    for num_sentences in sentences_per_paragraph:
        humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
        humanized_paragraphs.append(humanized_paragraph)
        sentence_index += num_sentences

    humanized_text = "\n".join(humanized_paragraphs)
    return humanized_text
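

# ---------------------------------------------------------------------------
# Minimal UI sketch (assumed): the excerpt above only defines the models and the
# paraphrasing functions. The block below is one illustrative way to expose
# `paraphrase_text` through a Gradio interface; component labels, slider ranges,
# and launch arguments are assumptions, not taken from the original app.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo = gr.Interface(
        fn=paraphrase_text,  # the gr.Progress() parameter is injected by Gradio automatically
        inputs=[
            gr.Textbox(lines=10, label="Input text"),
            gr.Radio(list(model_config.keys()), value="Base Model", label="Model"),
            gr.Slider(0.5, 2.0, value=1.2, label="Temperature"),
            gr.Slider(1.0, 2.0, value=1.0, label="Repetition penalty"),
            gr.Number(value=50, precision=0, label="Top-k"),  # precision=0 keeps top_k an integer
            gr.Slider(0.5, 2.0, value=1.0, label="Length penalty"),
        ],
        outputs=gr.Textbox(lines=10, label="Humanized text"),
        title="Text Humanizer",
    )
    demo.launch()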