import gc
import torch
from nltk import sent_tokenize
import nltk
from tqdm import tqdm
import gradio as gr
from peft import PeftModel
from transformers import T5ForConditionalGeneration, T5Tokenizer

nltk.download("punkt")
# select the compute device: a specific GPU if CUDA is available, otherwise fall back to CPU
GPU_IDX = 1  # which GPU to use
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
    device = torch.device(f"cuda:{GPU_IDX}")
    print(f"Using GPU: {GPU_IDX}")
else:
    print("CUDA is not available. Using CPU instead.")
    device = torch.device("cpu")

batch_size = 64  # number of sentences sent to generate() per batch

# Configuration for models and their adapters
model_config = {
    "Base Model": "polygraf-ai/poly-humanizer-base",
    "Large Model": "polygraf-ai/poly-humanizer-large",
    "XL Model": "polygraf-ai/poly-humanizer-XL-adapter",
}

# load and cache all models and tokenizers up front
# (the XL model ships as a PEFT adapter that is merged into the flan-t5-xl base)
models, tokenizers = {}, {}
for name, path in model_config.items():
    if name == "XL Model":
        model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", torch_dtype=torch.bfloat16).to(device)
        model = PeftModel.from_pretrained(model, path, torch_dtype=torch.bfloat16, is_trainable=False)
        model = model.merge_and_unload()
        models[name] = model
        tokenizers[name] = T5Tokenizer.from_pretrained("google/flan-t5-xl")
    else:
        model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
        models[name] = model
        tokenizers[name] = T5Tokenizer.from_pretrained(path)
    print(f"Loaded model: {name}, Num. params: {model.num_parameters()}")


def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
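    """Paraphrase a batch of sentences with a single generate() call.

    Each sentence is prefixed with a paraphrasing instruction, the batch is
    tokenized with padding and truncation, and the sampled outputs are decoded
    back to plain text in the same order as the inputs.
    """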
    inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
    inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=128,
        top_k=top_k,
        length_penalty=length_penalty,
    )
    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return answers


def paraphrase_text(
    text,
    progress=gr.Progress(),
    model_name="Base Model",
    temperature=1.2,
    repetition_penalty=1.0,
    top_k=50,
    length_penalty=1.0,
):
    """
    Optimization here is to feed all sentences at once to the model.
    Paragraphs are stored as a number of sentences per paragraph.
    """
    progress(0, desc="Starting to Humanize")
    progress(0.05)
    # Select the cached model and tokenizer
    tokenizer = tokenizers[model_name]
    model = models[model_name].to(device)

    # Split the text into paragraphs and then into sentences
    paragraphs = text.split("\n")
    all_sentences = []
    sentences_per_paragraph = []

    for paragraph in paragraphs:
        sentences = sent_tokenize(paragraph)
        sentences_per_paragraph.append(len(sentences))
        all_sentences.extend(sentences)

    # Process all sentences in batches
    paraphrased_sentences = []
    for i in range(0, len(all_sentences), batch_size):
        batch_sentences = all_sentences[i : i + batch_size]
        paraphrased_batch = paraphrase_sentences(
            model, tokenizer, batch_sentences, temperature, repetition_penalty, top_k, length_penalty
        )
        paraphrased_sentences.extend(paraphrased_batch)

        # Clear memory
        torch.cuda.empty_cache()
        gc.collect()

    # Reconstruct paragraphs
    humanized_paragraphs = []
    sentence_index = 0
    for num_sentences in sentences_per_paragraph:
        humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
        humanized_paragraphs.append(humanized_paragraph)
        sentence_index += num_sentences

    humanized_text = "\n".join(humanized_paragraphs)
    return humanized_text
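

# ---------------------------------------------------------------------------
# Minimal UI sketch (assumption): the Gradio interface this app actually uses
# is not shown in this excerpt. The wiring below only illustrates how
# `paraphrase_text` could be hooked up to a simple gr.Interface; the component
# labels and launch options are illustrative, not the original configuration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo = gr.Interface(
        fn=paraphrase_text,
        inputs=[
            gr.Textbox(lines=10, label="Input text"),
            gr.Dropdown(choices=list(model_config.keys()), value="Base Model", label="Model"),
        ],
        outputs=gr.Textbox(lines=10, label="Humanized text"),
        title="Text Humanizer",
    )
    # queue() lets the gr.Progress updates emitted inside paraphrase_text reach the UI
    demo.queue().launch()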