Spaces:
Running
Running
File size: 18,543 Bytes
313587b a5c6b5f bb73643 a5c6b5f bb73643 a5c6b5f 43cbfbd a5c6b5f 313587b a5c6b5f 313587b a5c6b5f b76aadb 4dd98b8 b76aadb a5c6b5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import gradio as gr
from minivectordb.embedding_model import EmbeddingModel
from minivectordb.vector_database import VectorDatabase
from multiprocessing import cpu_count
from functools import lru_cache
import fasttext, random, tiktoken, os, pickle
import concurrent.futures
# Opt in to tokenizer-level parallelism before any model is loaded.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Language-identification model used by detect_language_en_pt().
langdetect_model = fasttext.load_model('lid.176.ftz')

# Embedding model shared by extract_embeddings().
# NOTE(review): presumably onnx_model_cpu_core_count=1 pins inference to a
# single core so the process pool can parallelise across cores -- confirm.
embedding_model = EmbeddingModel(onnx_model_cpu_core_count=1)

# Stop-word collections for the two supported languages.
# NOTE: pickle must only ever be used on trusted files; these ship with the app.
# The files are opened with context managers so the handles are closed
# deterministically instead of being left to the garbage collector.
with open("en_stopwords.pkl", "rb") as stopwords_file:
    en_stop_words = pickle.load(stopwords_file)
with open("pt_stopwords.pkl", "rb") as stopwords_file:
    pt_stop_words = pickle.load(stopwords_file)

# Tokenizer used only to report the token reduction to the user.
tokenizer = tiktoken.encoding_for_model("gpt-4")
def count_tokens_tiktoken(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``.

    ``text`` comes straight from the user, so it may contain strings that
    look like special tokens (for example ``<|endoftext|>``). With the
    default settings ``encode`` raises ``ValueError`` for those; passing
    ``disallowed_special=()`` makes them be encoded as ordinary text instead.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    return len(token_ids)
def detect_language_en_pt(text):
    """Classify ``text`` as Portuguese or English.

    Returns ``'pt'`` when the top prediction of ``langdetect_model`` is
    Portuguese and ``'en'`` for every other predicted language.
    """
    # Newlines are flattened to spaces before the text is handed to the model.
    single_line = text.replace('\n', ' ')
    top_labels = langdetect_model.predict(single_line, k=1)[0]
    language_code = str(top_labels[0]).replace('__label__', '')
    return 'pt' if language_code == 'pt' else 'en'
def generate_combinations(text, word_reduction_factor, stopwords, semantic_embeddings, num_samples=100, keep_tokens=None):
    """Build candidate compressions of ``text`` by randomly dropping words.

    The text is split on whitespace. Roughly
    ``len(words) * word_reduction_factor`` words are dropped per candidate:
    a random number of stop words plus non-stop words taken from the tail of
    the similarity ranking returned by the vector database (the head of that
    ranking is always kept).

    Args:
        text: The text to compress.
        word_reduction_factor: Fraction of words to remove; ``None`` means 0.5.
        stopwords: Container of lower-cased stop words for the text's language.
        semantic_embeddings: Embedding of the full text, used as the query
            when ranking the individual non-stop words.
        num_samples: Number of random candidates to draw.
        keep_tokens: Tokens that are never removed; defaults to common
            punctuation marks.

    Returns:
        A list of unique candidate strings, in first-seen order.
    """
    if keep_tokens is None:
        keep_tokens = {"\n", ".", ",", ";", "!", "?"}
    if word_reduction_factor is None:
        word_reduction_factor = 0.5

    words = text.split()
    num_remove = int(len(words) * word_reduction_factor)

    # Partition word positions in one pass; keep_tokens belong to neither group.
    stopword_indices = []
    non_stopword_indices = []
    for i, word in enumerate(words):
        if word in keep_tokens:
            continue
        if word.lower() in stopwords:
            stopword_indices.append(i)
        else:
            non_stopword_indices.append(i)
    non_stopword_words = [words[i] for i in non_stopword_indices]

    # Rank non-stop word positions by similarity to the whole text.
    ordered_indices = []
    if non_stopword_words:  # nothing to embed or query for stop-word-only input
        non_stopword_embeddings = extract_embeddings_batch(non_stopword_words)
        semantic_db = VectorDatabase()
        ids = list(range(len(non_stopword_words)))
        metadata_dicts = [{"w": word} for word in non_stopword_words]
        semantic_db.store_embeddings_batch(ids, non_stopword_embeddings, metadata_dicts)
        _, _, ordered_words_metadata = semantic_db.find_most_similar(
            semantic_embeddings, k=len(non_stopword_words)
        )

        # A plain word -> index dict would map every repetition of a word to
        # its last position, yielding duplicate indices and making earlier
        # occurrences impossible to remove. Instead, hand out each occurrence's
        # own position exactly once (lists are reversed so pop() returns the
        # earliest remaining occurrence).
        positions_by_word = {}
        for i in reversed(non_stopword_indices):
            positions_by_word.setdefault(words[i], []).append(i)
        for meta in ordered_words_metadata:
            pending = positions_by_word.get(meta['w'])
            if pending:
                ordered_indices.append(pending.pop())

    # The head of the ranking is always kept; only the tail may be sampled.
    high_priority_count = max(len(ordered_indices) - num_remove, 0)
    high_priority_indices = set(ordered_indices[:high_priority_count])
    lower_priority_indices = ordered_indices[high_priority_count:]

    combinations = []
    for _ in range(num_samples):
        # Split the removal budget randomly between stop words and
        # low-priority non-stop words, never asking for more than exist.
        if stopword_indices:
            num_stop = random.randint(0, min(num_remove, len(stopword_indices)))
        else:
            num_stop = 0
        num_non_stop = min(num_remove - num_stop, len(lower_priority_indices))

        removed = set(random.sample(lower_priority_indices, num_non_stop))
        removed.update(random.sample(stopword_indices, num_stop))
        # Defensive: a high-priority position must never be dropped.
        removed -= high_priority_indices

        combinations.append(' '.join(word for i, word in enumerate(words) if i not in removed))

    # De-duplicate while preserving order so the result is reproducible for a
    # given random seed (a set would shuffle the candidates between runs).
    return list(dict.fromkeys(combinations))
@lru_cache(maxsize=50000)
def extract_embeddings(text):
    """Return the embedding of ``text`` from the module-level ``embedding_model``.

    Results are memoised per distinct ``text`` (up to 50000 entries), so
    repeated words and repeated candidates are only embedded once per process.
    """
    embedding = embedding_model.extract_embeddings(text)
    return embedding
def extract_embeddings_batch(texts):
    """Return a list with the embedding of every item of ``texts``, in order."""
    return list(map(extract_embeddings, texts))
def compress_semantically(input_text, word_reduction_factor=0.35):
    """Return the candidate compression of ``input_text`` closest to the original.

    Candidates are produced by ``generate_combinations``, embedded in a
    process pool, stored in a ``VectorDatabase`` and queried with the
    embedding of the full text; the single best match is returned.

    Args:
        input_text: The text to compress.
        word_reduction_factor: Fraction of words each candidate should drop.

    Returns:
        The text of the best-matching candidate.
    """
    # Longer inputs get fewer candidates; the first threshold exceeded wins.
    word_count = len(input_text.split())
    thresholds = [(1500, 80), (1000, 90), (700, 110), (500, 130), (250, 160)]
    num_samples = next(
        (value for threshold, value in thresholds if word_count > threshold),
        500,
    )

    semantic_embeddings = extract_embeddings(input_text)
    text_lang = detect_language_en_pt(input_text)
    stopwords = en_stop_words if text_lang == 'en' else pt_stop_words
    text_combinations = generate_combinations(
        input_text, word_reduction_factor, stopwords, semantic_embeddings, num_samples=num_samples
    )

    # Split the candidates into roughly one chunk per worker. The chunk size
    # is clamped to at least 1: on machines with more cores than samples the
    # integer division yields 0, and a slice step of 0 raises ValueError.
    workers = cpu_count()
    chunk_size = max(1, int(num_samples / workers))
    text_combinations_chunks = [
        text_combinations[start:start + chunk_size]
        for start in range(0, len(text_combinations), chunk_size)
    ]

    # Embed every candidate; executor.map preserves chunk order, so the
    # embeddings stay aligned with text_combinations.
    with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
        combinations_embeddings = [
            embedding
            for chunk_embeddings in executor.map(extract_embeddings_batch, text_combinations_chunks)
            for embedding in chunk_embeddings
        ]

    semantic_db = VectorDatabase()
    unique_ids = list(range(len(text_combinations)))
    metadata_dicts = [{"text": candidate} for candidate in text_combinations]
    semantic_db.store_embeddings_batch(unique_ids, combinations_embeddings, metadata_dicts)

    _, _, result = semantic_db.find_most_similar(semantic_embeddings, k=1)
    return result[0]['text']
async def predict(text, word_reduction_factor):
    """Gradio callback: compress ``text`` and report the token reduction.

    Args:
        text: The user-supplied text.
        word_reduction_factor: Fraction of words to drop, from the slider.

    Returns:
        The compressed text followed by the percentage of tokens saved, or a
        short message when the input is empty or longer than 700 words.
    """
    # Empty or whitespace-only input has zero tokens, which would otherwise
    # cause a division by zero in the reduction computation below.
    if not text or not text.strip():
        return "Please provide a text to compress."
    if len(text.split()) > 700:
        return "Text is too long for this demo. Please provide a text with less than 700 words."

    compressed = compress_semantically(text, word_reduction_factor=word_reduction_factor)
    original_tokens = count_tokens_tiktoken(text)
    compressed_tokens = count_tokens_tiktoken(compressed)
    perc_reduction = round(100 - (compressed_tokens / original_tokens) * 100, 2)
    return f"{compressed}\n\nToken Reduction: {perc_reduction}%"
gradio_examples = [
"""Almost 30 years ago, a revolutionary idea changed the way Europe regarded road collisions. It has probably saved countless lives but it's yet to be fully accepted by politicians. In 1995, a serious crash occurred on the E4 motorway near Stockholm, Sweden. Five young people were travelling in a hatchback car when the vehicle went into a roll near the exit ramp for the Ikea store. The car smashed into a concrete structure supporting a streetlight by the side of the road, and all five passengers were killed. "I am rather sure they were speeding, and as it was wet, they probably aquaplaned," says Claes Tingvall. Almost 30 years on, he struggles to remember all the details – but he is sure about one thing: "The car was a three-door Peugeot 205 GTI, red." More than 500 people died on Sweden's roads that year, but this tragedy signalled a turning point in how Tingvall, and eventually the world, regarded road crashes. An estimated 1.2 million lives are cut short by road traffic collisions globally each year, while millions more suffer often life-changing injuries. While the death toll has decreased slightly over the past 13 years – the number of fatalities on the world's roads are 5% lower than they were in 2010 according to the World Health Organization (WHO) – progress has been slow and falls far short of the WHO's target of halving the number of road deaths by the end of this decade. Today, Sweden has some of the lowest rates of road traffic fatalities in the world, and the story of how the country has strived to bring that number to zero provides a lesson for other countries where the death toll has remained stubbornly high. Back in 1995, Tingvall had become the head of road safety for the Swedish Road Adminstration. He was very well qualified for the role, but quite unlike any of his predecessors. 
Instead of coming up through the ranks of road transport engineers and bureaucrats, Tingvall had a medical background: he had studied at the renowned Karolinska Institute, where he had gained a doctorate in the epidemiology of injuries.""",
"""Há quase 30 anos, uma ideia revolucionária mudou a forma como a Europa encarava as colisões rodoviárias. Provavelmente salvou inúmeras vidas, mas ainda não foi totalmente aceite pelos políticos. Em 1995, ocorreu um grave acidente na autoestrada E4, perto de Estocolmo, na Suécia. Cinco jovens viajavam em um carro hatch quando o veículo capotou perto da rampa de saída da loja Ikea. O carro bateu em uma estrutura de concreto que sustentava um poste de luz na beira da estrada e todos os cinco passageiros morreram. “Tenho certeza de que eles estavam em alta velocidade e, como estava molhado, provavelmente aquaplanaram”, diz Claes Tingvall. Quase 30 anos depois, ele luta para lembrar de todos os detalhes – mas de uma coisa tem certeza: “O carro era um Peugeot 205 GTI de três portas, vermelho”. Mais de 500 pessoas morreram nas estradas da Suécia naquele ano, mas esta tragédia assinalou um ponto de viragem na forma como Tingvall, e eventualmente o mundo, encaravam os acidentes rodoviários. Estima-se que 1,2 milhões de vidas sejam ceifadas por colisões rodoviárias em todo o mundo todos os anos, enquanto outros milhões sofrem frequentemente lesões que alteram as suas vidas. Embora o número de mortos tenha diminuído ligeiramente ao longo dos últimos 13 anos – o número de vítimas mortais nas estradas mundiais é 5% inferior ao de 2010, segundo a Organização Mundial de Saúde (OMS) – o progresso tem sido lento e fica muito aquém do esperado. A meta da OMS de reduzir para metade o número de mortes nas estradas até ao final desta década. Hoje, a Suécia tem algumas das taxas de mortalidade rodoviária mais baixas do mundo, e a história de como o país se esforçou para reduzir esse número a zero fornece uma lição para outros países onde o número de mortes permaneceu teimosamente elevado. Em 1995, Tingvall tornou-se chefe de segurança rodoviária da Administração Rodoviária Sueca. Ele estava muito bem qualificado para o papel, mas muito diferente de qualquer um de seus antecessores. 
Em vez de ascender na hierarquia de engenheiros e burocratas de transporte rodoviário, Tingvall tinha formação médica: estudou no renomado Instituto Karolinska, onde obteve o doutorado em epidemiologia de lesões.""",
"""Akin to France's heartier, spicier, richer boeuf bourguignon, "alcatra" is synonymous with a single island in the remote Azores archipelago. The Azores, an archipelago of nine islands belonging to Portugal and located roughly between Europe and the US, are cow country. They're said to be home to more cattle than people, and despite being home to less than 3% of Portugal's population, the islands produce 30% of Portugal's dairy products and 13% of its beef. Beef is part of everyday life in the Azores, and come spring on one particular island, the ingredient even crosses paths with religion. In the days following Easter, Azorean people kick off a series of religious celebrations called Festas do Espírito Santo (Festivals of the Holy Spirit). During the 13th Century, a Catholic sect called the Cult of the Holy Spirit predicted a utopian era on Earth. This fringe faith was discouraged in mainland Europe but lived on in these remote islands in the middle of the Atlantic Ocean. The sect was also promoted by Portuguese queen Elizabeth of Aragon (also known as Elizabeth of Portugal), who was known for her charity. Over the subsequent centuries, a series of festivals emerged on the Azores that blended these utopian aspirations with the queen's alleged generosity. Between Easter and the week following Whitsunday, a total of eight weeks, the islands host a series of parades and other cultural and religious festivals that revolve around brightly coloured community houses called impérios. During this time, the community houses also collect donations from locals, which is then redistributed to people in the form of bread, beef and wine. These three elements generally come together in the form of a soup, called sopa do Espírito Santo, that's served at the impérios during the festivals. But on the island of Terceira, locals combine these ingredients in a different and delicious way, one that's become synonymous with the island's culinary identity. 
Austin Bush The Festas do Espírito Santo revolve around community houses called impérios (Credit: Austin Bush)Austin Bush The Festas do Espírito Santo revolve around community houses called impérios (Credit: Austin Bush) "People eat alcatra year round, but especially during the celebrations in spring and summer," explains Duarte Fournier. He is the Grand Master of the Brotherhood of Alcatra, a culinary fraternity on Terceira, and is telling me about the island's signature dish: cuts of beef braised in local wine, smoked pork fat and dried spices, resulting in something of a heartier, spicier, richer version of France's famed boeuf bourguignon. We're sitting at a cafe in Angra do Heroísmo, Terceira's largest city, and as we chat, children race to and from a nearby império delivering massive trays of raw beef to neighbours. Fournier tells me that alcatra likely has its origins in northern Portugal, where there's a tradition of baking goat in wine. "We don't know why it's called alcatra," he says. "We suppose it's from Arabic. Al catar means 'small pieces of meat'." According to Fournier, alcatra differs from mainland Portugal's baked meat dishes in that it includes dried spices, generally allspice and black peppercorns, but also sometimes clove or cinnamon.""",
"""Semelhante ao boeuf bourguignon mais vigoroso, picante e rico da França, "alcatra" é sinônimo de uma única ilha no remoto arquipélago dos Açores. Os Açores, um arquipélago de nove ilhas pertencentes a Portugal e localizado aproximadamente entre a Europa e os EUA, são um país de vacas. Diz-se que abrigam mais gado do que pessoas e, apesar de abrigarem menos de 3% da população de Portugal, as ilhas produzem 30% dos produtos lácteos de Portugal e 13% da sua carne bovina. A carne bovina faz parte do quotidiano dos Açores e, quando chega a primavera numa determinada ilha, o ingrediente cruza até com a religião. Nos dias seguintes à Páscoa, os açorianos dão início a uma série de celebrações religiosas denominadas Festas do Espírito Santo. Durante o século 13, uma seita católica chamada Culto do Espírito Santo previu uma era utópica na Terra. Esta fé marginal foi desencorajada na Europa continental, mas sobreviveu nestas ilhas remotas no meio do Oceano Atlântico. A seita também foi promovida pela rainha portuguesa Isabel de Aragão (também conhecida como Isabel de Portugal), que era conhecida pela sua caridade. Ao longo dos séculos seguintes, surgiu nos Açores uma série de festivais que misturavam estas aspirações utópicas com a alegada generosidade da rainha. Entre a Páscoa e a semana seguinte ao Domingo de Pentecostes, num total de oito semanas, as ilhas acolhem uma série de desfiles e outros festivais culturais e religiosos que giram em torno de casas comunitárias de cores vivas chamadas impérios. Durante esse período, as casas comunitárias também arrecadam doações dos moradores, que depois são redistribuídas às pessoas na forma de pão, carne e vinho. Estes três elementos juntam-se geralmente na forma de uma sopa, chamada sopa do Espírito Santo, que é servida nos impérios durante as festas. Mas na ilha Terceira os cariocas combinam estes ingredientes de uma forma diferente e deliciosa, que se tornou sinónimo da identidade culinária da ilha. 
Austin Bush As Festas do Espírito Santo giram em torno de casas comunitárias chamadas impérios (Crédito: Austin Bush)Austin Bush As Festas do Espírito Santo giram em torno de casas comunitárias chamadas impérios (Crédito: Austin Bush) "As pessoas comem alcatra o ano todo, mas principalmente durante as comemorações na primavera e no verão", explica Duarte Fournier. Ele é o Grão-Mestre da Irmandade de Alcatra, uma fraternidade culinária da Terceira, e está me contando sobre o prato característico da ilha: cortes de carne refogados no vinho local, gordura de porco defumada e especiarias secas, resultando em um prato mais forte e picante. , versão mais rica do famoso boeuf bourguignon da França. Estamos sentados num café em Angra do Heroísmo, a maior cidade da Terceira, e enquanto conversamos, crianças correm de e para um império próximo, entregando enormes bandejas de carne crua aos vizinhos. Fournier disse-me que a alcatra provavelmente tem a sua origem no norte de Portugal, onde existe uma tradição de assar cabra no vinho. “Não sabemos por que se chama alcatra”, diz ele. "Supomos que seja do árabe. Al catar significa 'pequenos pedaços de carne'." Segundo Fournier, a alcatra difere dos pratos de carne assada de Portugal continental por incluir especiarias secas, geralmente pimenta da Jamaica e pimenta preta, mas por vezes também cravo ou canela."""
]
# Gradio expects one list of input values per example row.
gradio_examples = [[text] for text in gradio_examples]

gradio_title = "Semantic Compression [ English / Portuguese ]"
gradio_description = "Provide a text and the system will compress it, trying to preserve the original meaning. The system uses semantic embeddings to compress the text. The word reduction factor controls how much the text will be compressed. The higher the value, the more compressed the text will be."

reduction_factor = gr.Slider(
    minimum=0.1,
    maximum=0.9,
    value=0.5,
    step=0.05,
    interactive=True,
    label="Word Reduction Factor",
)

# Create the gradio interface
demo = gr.Interface(
    fn=predict,
    inputs=[gr.Textbox(lines=10, label="Input Text"), reduction_factor],
    outputs=[gr.Textbox(label="Compressed Text")],
    title=gradio_title,
    description=gradio_description,
    examples=gradio_examples,
    allow_flagging="never",
)

# compress_semantically() uses a ProcessPoolExecutor. Under the "spawn" and
# "forkserver" start methods the worker processes re-import this module, so
# the server must only be started when the file is run as the main script.
if __name__ == "__main__":
    demo.launch()