import pandas as pd
import tweepy
import re
import emoji
import spacy
import gensim
import json
import string
from spacy.tokenizer import Tokenizer
from gensim.parsing.preprocessing import STOPWORDS as SW
from wordcloud import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from pprint import pprint
import numpy as np
import tqdm
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from googletrans import Translator
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from operator import itemgetter
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

global df

bearer_token = 'AAAAAAAAAAAAAAAAAAAAACEigwEAAAAACoP8KHJYLOKCL4OyB9LEPV00VB0%3DmyeDROUvw4uipHwvbPPfnTuY0M9ORrLuXrMvcByqZhwo3SUc4F'
client = tweepy.Client(bearer_token=bearer_token)

nlp = spacy.load('en_core_web_lg')

print('hi')


def scrape(keywords):
    """Scrapes recent tweets matching the keywords and stores them in the global df."""
    global df
    query = keywords + ' (lang:en OR lang:tl) -is:retweet'
    max_results = 100
    tweet_fields = ['geo', 'id', 'lang', 'created_at']
    expansions = ['geo.place_id']
    place_fields = ['contained_within', 'country', 'country_code', 'full_name', 'geo', 'id', 'name', 'place_type']

    response = client.search_recent_tweets(
        query=query,
        max_results=max_results,
        tweet_fields=tweet_fields,
        expansions=expansions,
        place_fields=place_fields
    )

    tweets = []
    for x in response[0]:
        tweets.append(str(x))
    place_data = response[1]

    df = pd.DataFrame(tweets, columns=['tweet'])
    return place_data


def get_example(dataset):
    """Loads a cached example dataset from a local CSV file."""
    df = pd.read_csv(dataset + '.csv')
    return df


def give_emoji_free_text(text):
    """
    Removes emojis from tweets
    Accepts: Text (tweets)
    Returns: Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
    clean_text = ' '.join([word for word in text.split() if not any(i in word for i in emoji_list)])
    return clean_text


def url_free_text(text):
    '''Cleans text from urls'''
    text = re.sub(r'http\S+', '', text)
    return text


def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    doc = nlp(text)
    for token in doc:
        if (not token.is_stop) and (not token.is_punct) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    return lemmas


# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing urls
    pattern = r"http\S+"
    tokens = re.sub(pattern, "", text)
    # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', tokens)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', tokens)  # Remove punctuation
    tokens = re.sub(r'\w*\d\w*', '', tokens)  # Remove words containing numbers
    # tokens = re.sub('@*!*$*', '', tokens)  # Remove @ ! $
    tokens = tokens.strip(',')  # TESTING THIS LINE
    tokens = tokens.strip('?')  # TESTING THIS LINE
    tokens = tokens.strip('!')  # TESTING THIS LINE
    tokens = tokens.strip("'")  # TESTING THIS LINE
    tokens = tokens.strip(".")  # TESTING THIS LINE
    tokens = tokens.lower().split()  # Make text lowercase and split it
    return tokens


def split_corpus(corpus, n):
    """Yields successive n-sized chunks of the corpus."""
    for i in range(0, len(corpus), n):
        yield corpus[i:i + n]


def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
    """Trains base LDA models for k = start..limit-1 and returns the models and their coherence scores."""
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                num_topics=num_topics,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                per_word_topics=True,
                                                id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values


def compute_coherence_values2(corpus, dictionary, k, a, b):
    """Trains an LDA model with k topics, alpha=a, eta=b and returns its c_v coherence."""
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=k,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                alpha=a,
                                                eta=b,
                                                per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=df['lemma_tokens'],
                                         dictionary=dictionary,
                                         coherence='c_v')
    return coherence_model_lda.get_coherence()


def assignMaxTopic(l):
    """Returns the topic id with the highest probability for a document."""
    maxTopic = max(l, key=itemgetter(1))[0]
    return maxTopic


def assignTopic(l):
    """Returns the list of topic ids assigned to a document."""
    topics = []
    for x in l:
        topics.append(x[0])
    return topics


def get_topic_value(row, i):
    """Returns the probability of topic i from a document's topic list."""
    if len(row) == 1:
        return row[0][1]
    else:
        try:
            return row[i][1]
        except Exception as e:
            print(e)


def full_lda(df):
    print('cleaning')
    df.rename(columns={'tweet': 'original_tweets'}, inplace=True)

    # Apply the function above and get tweets free of emojis
    call_emoji_free = lambda x: give_emoji_free_text(x)

    # Apply `call_emoji_free` which calls the function to remove all emojis
    df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)

    # Create a new column with url free tweets
    df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

    with open('stopwords-tl.json') as f:
        tlStopwords = json.loads(f.read())
    stopwords = set(STOPWORDS)
    stopwords.update(tlStopwords)
    stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])

    # Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    # Custom stopwords
    custom_stopwords = ['hi', '\n', '\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im',
                        'want', 'like', '$', '@']

    # Customize stop words by adding to the default list
    STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

    # ALL_STOP_WORDS = spacy + gensim + wordcloud
    ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)

    tokens = []
    STOP_WORDS.update(stopwords)
    for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
        doc_tokens = []
        for token in doc:
            if token.text.lower() not in STOP_WORDS:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)

    # Makes tokens column
    df['tokens'] = tokens

    # Make tokens a string again
    df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

    df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

    # Make lemmas a string again
    df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]

    # Apply tokenizer
    df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

    print('base model setup')
    # Create an id2word dictionary
    global id2word
    id2word = Dictionary(df['lemma_tokens'])

    # Filtering Extremes
    id2word.filter_extremes(no_below=2, no_above=.99)

    # Creating a corpus object
    global corpus
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
    global corpus_og
    corpus_og = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Split the corpus into 5 roughly equal folds; base-model coherence is
    # averaged over models trained with one fold held out at a time
    fold_size = max(1, -(-len(corpus) // 5))  # ceiling division
    corpus_folds = list(split_corpus(corpus, fold_size))

    global coherence
    coherence = 'c_v'
    coherence_averages = [0] * 8
    for i in range(5):
        training_corpus = [doc for j, fold in enumerate(corpus_folds) if j != i for doc in fold]
        model_list, coherence_values = compute_coherence_values_base_lda(dictionary=id2word,
                                                                         corpus=training_corpus,
                                                                         texts=df['lemma_tokens'],
                                                                         start=2, limit=10, step=1,
                                                                         coherence=coherence)
        for j in range(len(coherence_values)):
            coherence_averages[j] += coherence_values[j]

    limit = 10
    start = 2
    step = 1
    x = range(start, limit, step)

    coherence_averages = [x / 5 for x in coherence_averages]

    if coherence == 'c_v':
        k_max = max(coherence_averages)
    else:
        k_max = min(coherence_averages, key=abs)

    global num_topics
    num_topics = coherence_averages.index(k_max) + 2

    print('hyperparameter opt')
    grid = {}
    grid['Validation_Set'] = {}

    min_topics = 1
    max_topics = 10
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)

    alpha = [0.05, 0.1, 0.5, 1, 5, 10]
    # alpha.append('symmetric')
    # alpha.append('asymmetric')

    beta = [0.05, 0.1, 0.5, 1, 5, 10]
    # beta.append('symmetric')

    num_of_docs = len(corpus_og)
    corpus_sets = [gensim.utils.ClippedCorpus(corpus_og, int(num_of_docs * 0.75)),
                   corpus_og]
    corpus_title = ['75% Corpus', '100% Corpus']
    model_results = {'Validation_Set': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                     }

    # Grid search over alpha/beta on the clipped and full corpora
    pbar = tqdm.tqdm(total=len(corpus_sets) * len(alpha) * len(beta))
    for i in range(len(corpus_sets)):
        for a in alpha:
            for b in beta:
                cv = compute_coherence_values2(corpus=corpus_sets[i],
                                               dictionary=id2word,
                                               k=num_topics,
                                               a=a,
                                               b=b)
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
    pbar.close()

    params_df = pd.read_csv('lda_tuning_results_new.csv')
    params_df = params_df[params_df.Validation_Set == '75% Corpus']
    params_df.reset_index(inplace=True)
    params_df = params_df.replace(np.inf, -np.inf)

    max_params = params_df.loc[params_df['Coherence'].idxmax()]
    max_coherence = max_params['Coherence']
    max_alpha = max_params['Alpha']
    max_beta = max_params['Beta']
    max_validation_set = max_params['Validation_Set']

    global lda_model_final
    lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus_og,
                                                      id2word=id2word,
                                                      num_topics=num_topics,
                                                      random_state=100,
                                                      chunksize=200,
                                                      passes=10,
                                                      alpha=max_alpha,
                                                      eta=max_beta,
                                                      per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model_final,
                                         texts=df['lemma_tokens'],
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    lda_topics = lda_model_final.show_topics(num_words=10)

    print('assign topics')
    topics = []
    filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
    for topic in lda_topics:
        topics.append(preprocess_string(topic[1], filters))

    df['topic'] = [sorted(lda_model_final[corpus_og][text][0]) for text in range(len(df['original_tweets']))]
    df = df[df['topic'].map(lambda d: len(d)) > 0]
    df['max_topic'] = df['topic'].map(lambda row: assignMaxTopic(row))

    global topic_clusters
    topic_clusters = []
    for i in range(num_topics):
        topic_clusters.append(df[df['max_topic'].isin(([i]))])
        topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()

    print('rep topics')
    global top_tweets
    top_tweets = []
    for i in range(len(topic_clusters)):
        tweets = df.loc[df['max_topic'] == i].copy()
        tweets['topic'] = tweets['topic'].apply(lambda x: get_topic_value(x, i))
        # tweets['topic'] = [row[i][1] for row in tweets['topic']]
        tweets_sorted = tweets.sort_values('topic', ascending=False)
        tweets_sorted = tweets_sorted.drop_duplicates(subset=['original_tweets'])
        rep_tweets = tweets_sorted['original_tweets'].tolist()
        top_tweets.append(rep_tweets[:5])
        # print('Topic ', i)
        # print(rep_tweets[:5])
    return top_tweets


def topic_summarization(topic_groups):
    """Translates each topic's tweets to English and generates a headline per topic."""
    tokenizer = AutoTokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
    model = AutoModelForSeq2SeqLM.from_pretrained("Michau/t5-base-en-generate-headline")
    translator = Translator()

    headlines = []
    for i in range(len(topic_groups)):
        tweets = " ".join(topic_groups[i])
        # print(tweets)
        out = translator.translate(tweets, dest='en')
        text = out.text
        # print(tweets)

        max_len = 256
        encoding = tokenizer.encode_plus(text, return_tensors="pt")
        input_ids = encoding["input_ids"]
        attention_masks = encoding["attention_mask"]

        beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=3,
            early_stopping=True,
        )
        result = tokenizer.decode(beam_outputs[0])
        print(result)
        headlines.append("Topic " + str(i) + " " + result)

    return headlines


def compute_coherence_value_bertopic(topic_model):
    """Computes coherence for a fitted BERTopic model, skipping the -1 outlier topic."""
    topic_words = [[words for words, _ in topic_model.get_topic(topic)]
                   for topic in topic_model.get_topics() if topic != -1]
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=df['lemma_tokens'],
                                     corpus=corpus,
                                     dictionary=id2word,
                                     coherence=coherence)
    coherence_score = coherence_model.get_coherence()
    return coherence_score


def base_bertopic():
    df['lemma_tokens_string'] = df['lemma_tokens'].apply(lambda x: ' '.join(x))
    global id2word
    id2word = Dictionary(df['lemma_tokens'])
    global corpus
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
    global coherence
    coherence = 'c_v'

    global umap_model
    umap_model = UMAP(n_neighbors=15,
                      n_components=5,
                      min_dist=0.0,
                      metric='cosine',
                      random_state=100)

    base_topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)

    topics, probabilities = base_topic_model.fit_transform(df['lemma_tokens_string'])

    try:
        print(compute_coherence_value_bertopic(base_topic_model))
    except:
        print('Unable to generate meaningful topics (Base BERTopic model)')


def optimized_bertopic():
    vectorizer_model = CountVectorizer(max_features=1_000, stop_words="english")
    optimized_topic_model = BERTopic(umap_model=umap_model,
                                     language="multilingual",
                                     n_gram_range=(1, 3),
                                     vectorizer_model=vectorizer_model,
                                     calculate_probabilities=True)

    topics, probabilities = optimized_topic_model.fit_transform(df['lemma_tokens_string'])

    try:
        print(compute_coherence_value_bertopic(optimized_topic_model))
    except:
        print('Unable to generate meaningful topics; use the base BERTopic model if possible')

    rep_docs = optimized_topic_model.representative_docs_

    global top_tweets
    top_tweets = []

    # Map each topic's representative documents back to the original tweets
    for topic in rep_docs:
        if topic == -1:
            print('test')
            continue
        topic_docs = rep_docs.get(topic)

        tweets = []
        for doc in topic_docs:
            index = df.isin([doc]).any(axis=1).idxmax()
            # print(index)
            tweets.append(df.loc[index, 'original_tweets'])
        print(tweets)
        top_tweets.append(tweets)
    return top_tweets


global examples


def main(dataset, model, progress=gr.Progress(track_tqdm=True)):
    global df
    examples = [
        "katip,katipunan",
        "bgc,bonifacio global city",
        "pobla,poblacion",
        "cubao",
        "taft"
    ]
    keyword_list = dataset.split(',')
    if len(keyword_list) > 1:
        keywords = '(' + ' OR '.join(keyword_list) + ')'
    else:
        keywords = keyword_list[0]

    if dataset in examples:
        df = get_example(keywords)
        place_data = 'test'
    else:
        print(dataset)
        place_data = str(scrape(keywords))
        print(df)

    if model == 'LDA':
        print('doing lda')
        top_tweets = full_lda(df)
        print('done lda')
        place_data = 'test'
    else:
        base_bertopic()
        top_tweets = optimized_bertopic()

    print('doing topic summarization')
    headlines = topic_summarization(top_tweets)
    headlines = '\n'.join(str(h) for h in headlines)

    return place_data, headlines


iface = gr.Interface(fn=main,
                     inputs=[gr.Dropdown(["katip,katipunan",
                                          "bgc,bonifacio global city",
                                          "cubao",
                                          "taft",
                                          "pobla,poblacion"], label="Dataset"),
                             gr.Dropdown(["LDA",
                                          "BERTopic"], label="Model")
                             ],
                     # examples=examples,
                     outputs=["text", "text"],
                     enable_queue=True,
                     debug=True,
                     )
iface.launch(debug=True, enable_queue=True)
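
# ---------------------------------------------------------------------------
# Optional: a minimal sketch of running the pipeline programmatically, without
# the Gradio UI, assuming the cached example CSV and stopwords-tl.json files
# the app expects are available locally. To try it, comment out iface.launch()
# above and uncomment the lines below.
# ---------------------------------------------------------------------------
# if __name__ == '__main__':
#     place_data, headlines = main('katip,katipunan', 'LDA')
#     print(place_data)
#     print(headlines)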