dataset model csv english
app.py
CHANGED
@@ -67,7 +67,7 @@ class Model:
         new_tokens=[]
         ig_tokens=[]
         for token in tokens:
-            print('token_texto:',token,caracter)
+            #print('token_texto:',token,caracter)
             ind=len(new_tokens)
             if i<len(tokens):
                 if not token.startswith(caracter):
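The hunk above silences a per-token debug print by commenting it out, which means re-editing the file whenever the trace is needed again. A minimal alternative sketch (not part of this commit) routes the same trace through Python's logging module so it can be toggled by log level; the sample tokens and the "▁" subword marker are illustrative assumptions:

```python
import logging

logging.basicConfig(level=logging.INFO)  # raise to DEBUG to see the trace
logger = logging.getLogger("app")

for token in ["▁Hola", "▁mundo"]:
    # Same information as the commented-out print, but emitted only at
    # DEBUG level, so no source edit is needed to enable or silence it.
    logger.debug("token_texto: %s %s", token, "▁")
```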
@@ -324,14 +324,14 @@ class ModeloDataset:
         self.idioma=""
         self.modelo_ner=""
         self.categoria_texto=""
-        self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+        #self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
     def reordenacion_tokens(self,tokens,caracter):
 
         i=0
         new_tokens=[]
         ig_tokens=[]
         for token in tokens:
-
+
             ind=len(new_tokens)
             if i<len(tokens):
                 if not token.startswith(caracter):
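Commenting out the from_pretrained call in __init__ stops the Space from downloading the BSC-LT/roberta_model_for_anonimization tokenizer at startup when a request only needs the DistilBERT path. A hedged sketch of the lazy-load pattern this enables (the get_tokenizer helper is hypothetical, not taken from app.py):

```python
from transformers import AutoTokenizer

class ModeloDataset:
    def __init__(self):
        self.idioma = ""
        self.modelo_ner = ""
        self.categoria_texto = ""
        self.tokenizer = None  # deferred: nothing is downloaded at startup

    def get_tokenizer(self, checkpoint="BSC-LT/roberta_model_for_anonimization"):
        # Hypothetical helper: fetch and cache the tokenizer on first use.
        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        return self.tokenizer
```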
@@ -477,7 +477,7 @@ class ModeloDataset:
         print('idioma:',idioma)
         self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
         self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
-        sentences_list = _sentences.tolist()
+        sentences_list = _sentences.tolist()
         inputs = self.tokenizer(sentences_list, padding=True, truncation=True, return_tensors="pt", max_length=512)
         with torch.no_grad():
             outputs = self.model(**inputs)
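For context on the hunk above: outputs.logits from a token-classification head has shape (batch, seq_len, num_labels), and predictions are usually decoded with an argmax per token position. A standalone sketch, assuming the fine-tuned checkpoint's config populates id2label with its four labels; the Spanish sample sentence is illustrative:

```python
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")

inputs = tokenizer(["Juan vive en Madrid."], padding=True, truncation=True,
                   return_tensors="pt", max_length=512)
with torch.no_grad():
    outputs = model(**inputs)

pred_ids = outputs.logits.argmax(dim=-1)  # one label id per token position
for row_ids, row_preds, row_mask in zip(inputs["input_ids"], pred_ids,
                                        inputs["attention_mask"]):
    tokens = tokenizer.convert_ids_to_tokens(row_ids.tolist())
    labels = [model.config.id2label[p.item()] for p in row_preds]
    # attention_mask == 0 marks padding; skip those positions when reporting.
    print([(t, l) for t, l, m in zip(tokens, labels, row_mask) if m.item() == 1])
```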
@@ -662,7 +662,8 @@ def procesar(texto,archivo, etiquetas):
 
     if archivo.name.split(".")[1]=="csv":
         print('csv')
-        df=pd.read_csv(archivo.name,delimiter=";",encoding='latin-1')
+        #df=pd.read_csv(archivo.name,delimiter=";",encoding='latin-1')
+        df=pd.read_csv(archivo.name,delimiter=";")
 
         df_new = pd.DataFrame( columns=df.columns.values)
         model.identificacion_idioma(df.iloc[0][0])
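The last hunk drops encoding='latin-1', so pandas now assumes UTF-8 and a Latin-1 CSV upload that previously parsed will raise UnicodeDecodeError. A defensive sketch (not in this commit; the leer_csv helper name is hypothetical) that prefers UTF-8 but falls back to the old behaviour:

```python
import pandas as pd

def leer_csv(path, delimiter=";"):
    # pandas defaults to UTF-8; fall back to Latin-1 (the encoding this
    # commit removed) for legacy uploads instead of failing outright.
    try:
        return pd.read_csv(path, delimiter=delimiter)
    except UnicodeDecodeError:
        return pd.read_csv(path, delimiter=delimiter, encoding="latin-1")
```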