File size: 6,358 Bytes
a86ebfe
074b087
4f600ec
 
 
 
64547bc
074b087
336e489
a86ebfe
336e489
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a86ebfe
 
 
 
 
 
 
 
074b087
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc3e5ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
074b087
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os

import numpy as np
import pandas as pd
import pysentimiento
import regex as re
import streamlit as st
import torch
import tweepy as tw

from pysentimiento.preprocessing import preprocess_tweet

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AdamW
# The tokenizer and the classification model are both loaded from the same
# Hugging Face Hub checkpoint, so the identifier is spelled out only once.
_CHECKPOINT = 'hackathon-pln-es/twitter_sexismo-finetuned-robertuito-exist2021'
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT)

import torch
# Pick the compute device once at start-up: CUDA when torch reports it is
# available, otherwise the CPU. The choice is echoed to stdout.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('I will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

    
# Twitter API credentials are read from the environment so that they never
# live in source control. Any key that has ever been committed to a repository
# must be treated as compromised and rotated in the Twitter developer portal.
def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty, so a misconfigured
            deployment fails at start-up with an actionable message instead
            of failing later inside an API call.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Missing Twitter credential: set the {name} environment variable"
        )
    return value


consumer_key = _require_env("TWITTER_CONSUMER_KEY")
consumer_secret = _require_env("TWITTER_CONSUMER_SECRET")
access_token = _require_env("TWITTER_ACCESS_TOKEN")
access_token_secret = _require_env("TWITTER_ACCESS_TOKEN_SECRET")

# OAuth 1.0a user-context client; wait_on_rate_limit makes tweepy wait
# instead of raising when the rate limit is hit.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

# Page chrome: wide layout, a page-level background rule, and the header.
st.set_page_config(layout="wide")
st.markdown(
    '<style>body{background-color: Blue;}</style>',
    unsafe_allow_html=True,
)

# The header lives in the wider (right-hand) of two columns.
colT1, colT2 = st.columns([2, 8])
with colT2:
    # CSS class used by the title paragraph rendered just below.
    st.markdown(
        " <style> .font {\n"
        "    font-size:40px ; font-family: 'Cooper Black'; color: #06bf69;} \n"
        "    </style> ",
        unsafe_allow_html=True,
    )
    st.markdown(
        '<p class="font">Análisis de comentarios sexistas en Twitter</p>',
        unsafe_allow_html=True,
    )

    # Two further CSS classes (.font1, .font2) are injected here.
    st.markdown(
        " <style> .font1 {\n"
        "    font-size:28px ; font-family: 'Times New Roman'; color: #8d33ff;} \n"
        "    </style> ",
        unsafe_allow_html=True,
    )

    st.markdown(
        " <style> .font2 {\n"
        "    font-size:16px ; font-family: 'Times New Roman'; color: #3358ff;} \n"
        "    </style> ",
        unsafe_allow_html=True,
    )

   
def color_survived(val):
    """Return the CSS declaration used to highlight one 'Sexista' cell.

    This is the cell-styling callback handed to ``Styler.applymap`` in
    ``run``. Cells labelled 'Sexista' get a light red background, every other
    value (i.e. 'No Sexista') a light green one; the text colour is forced to
    black so the cell stays readable on either background.
    """
    if val == 'Sexista':
        return 'background-color: #f8d7da; color: black'
    return 'background-color: #d4edda; color: black'


def _fetch_tweet_texts(search_words, number_of_tweets, by_user):
    """Return the text of up to *number_of_tweets* tweets.

    When *by_user* is true, *search_words* is used as a screen name and the
    tweets come from that user's timeline; otherwise it is used as a search
    query for Spanish tweets with retweets filtered out.
    """
    if by_user:
        tweets = api.user_timeline(screen_name=search_words, count=number_of_tweets)
    else:
        query = search_words + " -filter:retweets"
        # NOTE(review): `since` is passed through unchanged; confirm that the
        # installed tweepy version / Twitter endpoint still honours it.
        tweets = tw.Cursor(
            api.search_tweets, q=query, lang="es", since="2020-09-14"
        ).items(number_of_tweets)
    # Cursor results are lazy: the API requests happen while iterating here.
    return [tweet.text for tweet in tweets]


def _predict_labels(texts, batch_size=25):
    """Classify *texts* with the module-level model; return argmax class ids.

    Texts are tokenized to fixed-length (128 token) sequences, run through the
    model in batches of *batch_size* without gradient tracking, and the index
    of the largest logit per text is returned as a 1-D NumPy array.
    *texts* must be non-empty.
    """
    encoded = tokenizer.batch_encode_plus(
        texts,
        max_length=128,
        add_special_tokens=True,
        return_attention_mask=True,
        padding="max_length",  # replaces the deprecated pad_to_max_length=True
        truncation=True,
    )
    dataset = TensorDataset(
        torch.tensor(encoded["input_ids"]),
        torch.tensor(encoded["attention_mask"]),
    )
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)
    print('Predicting labels for {:,} test sentences...'.format(len(dataset)))

    # The batches are moved to `device`, so the model has to live there too;
    # otherwise a GPU host fails with a device-mismatch error.
    model.to(device)
    model.eval()

    batch_logits = []
    for batch in loader:
        input_ids, attention_mask = (t.to(device) for t in batch)
        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        batch_logits.append(outputs[0].detach().cpu().numpy())
    return np.concatenate(batch_logits, axis=0).argmax(axis=1)


def run():
    """Render the analysis form and, on submit, the classification table.

    The form asks for a search term or user name, how many tweets to fetch
    (0-50, default 10) and exactly one of the 'Término' / 'Usuario'
    checkboxes. On a valid submission the tweets are fetched, preprocessed
    with ``preprocess_tweet``, classified, and shown in a table whose
    'Sexista' column reads 'No Sexista' for class 0 and 'Sexista' otherwise.
    Invalid input, API errors and empty results are reported in the page
    instead of raising.
    """
    with st.form("my_form"):
        col, _, _ = st.columns([2, 2, 1])
        st.write("Escoja una Opción")
        search_words = col.text_input("Introduzca el termino o usuario para analizar y pulse el check correspondiente")
        number_of_tweets = col.number_input('Introduzca número de twweets a analizar. Máximo 50', 0, 50, 10)
        termino = st.checkbox('Término')
        usuario = st.checkbox('Usuario')
        submit_button = col.form_submit_button(label='Analizar')

        if not submit_button:
            return

        # Exactly one of the two modes must be selected.
        if not termino and not usuario:
            st.text('Error no se ha seleccionado ningun check')
            return
        if termino and usuario:
            st.text('Error se han seleccionado los dos check')
            return
        if not search_words.strip():
            st.text('Error no se ha introducido ningún término o usuario')
            return

        try:
            tweet_list = _fetch_tweet_texts(search_words, number_of_tweets, by_user=usuario)
        except tw.TweepyException as err:
            # e.g. unknown screen name, protected account, or rejected credentials
            st.text('Error al consultar Twitter: ' + str(err))
            return

        # Nothing to classify: building the table from an empty list would fail.
        if not tweet_list:
            st.text('No se han encontrado tweets para analizar')
            return

        cleaned = [preprocess_tweet(text) for text in tweet_list]
        labels = _predict_labels(cleaned)

        df = pd.DataFrame(
            list(zip(tweet_list, labels)),
            columns=[f'Últimos {number_of_tweets} Tweets de {search_words}', 'Sexista'],
        )
        df['Sexista'] = np.where(df['Sexista'] == 0, 'No Sexista', 'Sexista')

        # NOTE(review): only the first 20 rows are displayed even though up to
        # 50 tweets can be requested; confirm this cap is intended.
        st.table(df.reset_index(drop=True).head(20).style.applymap(color_survived, subset=['Sexista']))

# Script entry point: draw the form and, once it is submitted, the results table.
run()