#
# Natural Language Toolkit: Sentiment Analyzer
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Pierpaolo Pantone <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Utility methods for Sentiment Analysis.
"""

import codecs
import csv
import json
import pickle
import random
import re
import sys
import time
from copy import deepcopy

import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.data import load
from nltk.tokenize.casual import EMOTICON_RE

# ////////////////////////////////////////////////////////////
# { Regular expressions
# ////////////////////////////////////////////////////////////

# Regular expression for negation by Christopher Potts
NEGATION = r"""
    (?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't"""

NEGATION_RE = re.compile(NEGATION, re.VERBOSE)
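
# A minimal illustrative sketch (not a doctest) of what NEGATION_RE matches:
# whole negation words, and the "n't" clitic anywhere in a token.
#
#     NEGATION_RE.search("never")         # match: anchored word alternative
#     NEGATION_RE.search("didn't")        # match: via the n't branch
#     NEGATION_RE.search("nevertheless")  # no match: not a bare negation word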

CLAUSE_PUNCT = r"^[.:;!?]$"
CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)

# Happy and sad emoticons

HAPPY = {
    ":-)",
    ":)",
    ";)",
    ":o)",
    ":]",
    ":3",
    ":c)",
    ":>",
    "=]",
    "8)",
    "=)",
    ":}",
    ":^)",
    ":-D",
    ":D",
    "8-D",
    "8D",
    "x-D",
    "xD",
    "X-D",
    "XD",
    "=-D",
    "=D",
    "=-3",
    "=3",
    ":-))",
    ":'-)",
    ":')",
    ":*",
    ":^*",
    ">:P",
    ":-P",
    ":P",
    "X-P",
    "x-p",
    "xp",
    "XP",
    ":-p",
    ":p",
    "=p",
    ":-b",
    ":b",
    ">:)",
    ">;)",
    ">:-)",
    "<3",
}

SAD = {
    ":L",
    ":-/",
    ">:/",
    ":S",
    ">:[",
    ":@",
    ":-(",
    ":[",
    ":-||",
    "=L",
    ":<",
    ":-[",
    ":-<",
    "=\\",
    "=/",
    ">:(",
    ":(",
    ">.<",
    ":'-(",
    ":'(",
    ":\\",
    ":-c",
    ":c",
    ":{",
    ">:\\",
    ";(",
}


def timer(method):
    """
    A timer decorator to measure execution performance of methods.
    """

    def timed(*args, **kw):
        start = time.time()
        result = method(*args, **kw)
        end = time.time()
        tot_time = end - start
        hours = tot_time // 3600
        mins = tot_time // 60 % 60
        # in Python 2.x round() will return a float, so we convert it to int
        secs = int(round(tot_time % 60))
        if hours == 0 and mins == 0 and secs < 10:
            print(f"[TIMER] {method.__name__}(): {tot_time:.3f} seconds")
        else:
            print(f"[TIMER] {method.__name__}(): {hours}h {mins}m {secs}s")
        return result

    return timed
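
# A minimal usage sketch (the function name `slow_add` is hypothetical):
# decorating a callable with @timer prints its wall-clock running time.
#
#     @timer
#     def slow_add(a, b):
#         time.sleep(1)
#         return a + b
#
#     slow_add(1, 2)  # prints something like "[TIMER] slow_add(): 1.001 seconds"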


# ////////////////////////////////////////////////////////////
# { Feature extractor functions
# ////////////////////////////////////////////////////////////
"""
Feature extractor functions are declared outside the SentimentAnalyzer class.
Users should have the possibility to create their own feature extractors
without modifying SentimentAnalyzer.
"""


def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    features = {}
    if handle_negation:
        document = mark_negation(document)
    for word in unigrams:
        features[f"contains({word})"] = word in set(document)
    return features


def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items()) # doctest: +NORMALIZE_WHITESPACE
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features[f"contains({bigr[0]} - {bigr[1]})"] = bigr in nltk.bigrams(document)
    return features
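
# Following up on the module comment above about user-defined feature
# extractors: a minimal sketch of a custom extractor, assuming the usual
# SentimentAnalyzer.add_feat_extractor() registration (the extractor name and
# keyword argument are illustrative, not part of this module).
#
#     def extract_length_feats(document, buckets=(5, 15)):
#         """Bucket the document by token count into 'short'/'medium'/'long'."""
#         n = len(document)
#         label = "short" if n <= buckets[0] else "medium" if n <= buckets[1] else "long"
#         return {"doc_length": label}
#
#     # sentim_analyzer.add_feat_extractor(extract_length_feats, buckets=(5, 15))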


# ////////////////////////////////////////////////////////////
# { Helper Functions
# ////////////////////////////////////////////////////////////


def mark_negation(document, double_neg_flip=False, shallow=False):
    """
    Append _NEG suffix to words that appear in the scope between a negation
    and a punctuation mark.

    :param document: a list of words/tokens, or a tuple (words, label).
    :param shallow: if True, the method will modify the original document in place.
    :param double_neg_flip: if True, double negation is considered affirmation
        (we activate/deactivate negation scope every time we find a negation).
    :return: if `shallow == True` the method will modify the original document
        and return it. If `shallow == False` the method will return a modified
        document, leaving the original unmodified.

    >>> sent = "I didn't like this movie . It was bad .".split()
    >>> mark_negation(sent)
    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
    """
    if not shallow:
        document = deepcopy(document)
    # check if the document is labeled. If so, do not consider the label.
    labeled = document and isinstance(document[0], (tuple, list))
    if labeled:
        doc = document[0]
    else:
        doc = document
    neg_scope = False
    for i, word in enumerate(doc):
        if NEGATION_RE.search(word):
            if not neg_scope or (neg_scope and double_neg_flip):
                neg_scope = not neg_scope
                continue
            else:
                doc[i] += "_NEG"
        elif neg_scope and CLAUSE_PUNCT_RE.search(word):
            neg_scope = not neg_scope
        elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
            doc[i] += "_NEG"

    return document
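
# A small illustrative sketch (not a doctest): labeled documents, i.e.
# (words, label) tuples, are handled too; only the word list is modified.
#
#     mark_negation((["not", "good", "."], "neg"))
#     # -> (['not', 'good_NEG', '.'], 'neg')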


def output_markdown(filename, **kwargs):
    """
    Write the output of an analysis to a file.
    """
    with codecs.open(filename, "at") as outfile:
        text = "\n*** \n\n"
        text += "{} \n\n".format(time.strftime("%d/%m/%Y, %H:%M"))
        for k in sorted(kwargs):
            if isinstance(kwargs[k], dict):
                dictionary = kwargs[k]
                text += f"  - **{k}:**\n"
                for entry in sorted(dictionary):
                    text += f"    - {entry}: {dictionary[entry]} \n"
            elif isinstance(kwargs[k], list):
                text += f"  - **{k}:**\n"
                for entry in kwargs[k]:
                    text += f"    - {entry}\n"
            else:
                text += f"  - **{k}:** {kwargs[k]} \n"
        outfile.write(text)
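
# A hypothetical usage sketch (the file name and keyword names are
# illustrative); each keyword argument becomes a bullet in the appended report.
#
#     output_markdown("results.md",
#                     Dataset="movie_reviews",
#                     Classifier="NaiveBayesClassifier",
#                     Results={"Accuracy": 0.81})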


def split_train_test(all_instances, n=None):
    """
    Randomly split `n` instances of the dataset into train and test sets.

    :param all_instances: a list of instances (e.g. documents) that will be split.
    :param n: the number of instances to consider (in case we want to use only a
        subset).
    :return: two lists of instances. Train set is 8/10 of the total and test set
        is 2/10 of the total.
    """
    random.seed(12345)
    random.shuffle(all_instances)
    if not n or n > len(all_instances):
        n = len(all_instances)
    train_set = all_instances[: int(0.8 * n)]
    test_set = all_instances[int(0.8 * n) : n]

    return train_set, test_set
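
# A minimal illustrative sketch (not a doctest): ten instances are shuffled
# with a fixed seed (note the input list is shuffled in place) and split 8/2.
#
#     train, test = split_train_test(list(range(10)))
#     len(train), len(test)  # -> (8, 2)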


def _show_plot(x_values, y_values, x_labels=None, y_labels=None):
    try:
        import matplotlib.pyplot as plt
    except ImportError as e:
        raise ImportError(
            "The plot function requires matplotlib to be installed. "
            "See https://matplotlib.org/"
        ) from e

    plt.locator_params(axis="y", nbins=3)
    axes = plt.axes()
    axes.yaxis.grid()
    plt.plot(x_values, y_values, "ro", color="red")
    plt.ylim(ymin=-1.2, ymax=1.2)
    plt.tight_layout(pad=5)
    if x_labels:
        plt.xticks(x_values, x_labels, rotation="vertical")
    if y_labels:
        plt.yticks([-1, 0, 1], y_labels, rotation="horizontal")
    # Pad margins so that markers are not clipped by the axes
    plt.margins(0.2)
    plt.show()


# ////////////////////////////////////////////////////////////
# { Parsing and conversion functions
# ////////////////////////////////////////////////////////////


def json2csv_preprocess(
    json_file,
    outfile,
    fields,
    encoding="utf8",
    errors="replace",
    gzip_compress=False,
    skip_retweets=True,
    skip_tongue_tweets=True,
    skip_ambiguous_tweets=True,
    strip_off_emoticons=True,
    remove_duplicates=True,
    limit=None,
):
    """
    Convert json file to csv file, preprocessing each row to obtain a suitable
    dataset for tweets Sentiment Analysis.

    :param json_file: the original json file containing tweets.
    :param outfile: the output csv filename.
    :param fields: a list of fields that will be extracted from the json file and
        kept in the output csv file.
    :param encoding: the encoding of the files.
    :param errors: the error handling strategy for the output writer.
    :param gzip_compress: if True, create a compressed GZIP file.

    :param skip_retweets: if True, remove retweets.
    :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P"
        emoticons.
    :param skip_ambiguous_tweets: if True, remove tweets containing both happy
        and sad emoticons.
    :param strip_off_emoticons: if True, strip off emoticons from all tweets.
    :param remove_duplicates: if True, remove tweets appearing more than once.
    :param limit: an integer to set the number of tweets to convert. After the
        limit is reached the conversion will stop. It can be useful to create
        subsets of the original tweets json data.
    """
    with codecs.open(json_file, encoding=encoding) as fp:
        (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
        # write the list of fields as header
        writer.writerow(fields)

        if remove_duplicates == True:
            tweets_cache = []
        i = 0
        for line in fp:
            tweet = json.loads(line)
            row = extract_fields(tweet, fields)
            try:
                text = row[fields.index("text")]
                # Remove retweets
                if skip_retweets == True:
                    if re.search(r"\bRT\b", text):
                        continue
                # Remove tweets containing ":P" and ":-P" emoticons
                if skip_tongue_tweets == True:
                    if re.search(r"\:\-?P\b", text):
                        continue
                # Remove tweets containing both happy and sad emoticons
                if skip_ambiguous_tweets == True:
                    all_emoticons = EMOTICON_RE.findall(text)
                    if all_emoticons:
                        if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD):
                            continue
                # Strip off emoticons from all tweets
                if strip_off_emoticons == True:
                    row[fields.index("text")] = re.sub(
                        r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text)
                    )
                # Remove duplicate tweets
                if remove_duplicates == True:
                    if row[fields.index("text")] in tweets_cache:
                        continue
                    else:
                        tweets_cache.append(row[fields.index("text")])
            except ValueError:
                pass
            writer.writerow(row)
            i += 1
            if limit and i >= limit:
                break
        outf.close()
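
# A hypothetical usage sketch (file names are illustrative); this mirrors how
# the demos below call the function on the twitter_samples corpus files.
# Note that `_outf_writer` and `extract_fields` come from nltk.twitter.common
# and are only imported in the __main__ block at the bottom of this module.
#
#     json2csv_preprocess("positive_tweets.json", "positive_tweets.csv",
#                         ["id", "text"], limit=1000)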


def parse_tweets_set(
    filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True
):
    """
    Parse a csv file containing tweets and return a list of (text, label) tuples.

    :param filename: the input csv filename.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
        sentences.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).

    :return: a list of (text, label) tuples.
    """
    tweets = []
    if not sent_tokenizer:
        sent_tokenizer = load("tokenizers/punkt/english.pickle")

    with codecs.open(filename, "rt") as csvfile:
        reader = csv.reader(csvfile)
        if skip_header == True:
            next(reader, None)  # skip the header
        i = 0
        for tweet_id, text in reader:
            # text = text[1]
            i += 1
            sys.stdout.write(f"Loaded {i} tweets\r")
            # Apply sentence and word tokenizer to text
            if word_tokenizer:
                tweet = [
                    w
                    for sent in sent_tokenizer.tokenize(text)
                    for w in word_tokenizer.tokenize(sent)
                ]
            else:
                tweet = text
            tweets.append((tweet, label))

    print(f"Loaded {i} tweets")
    return tweets
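
# A hypothetical usage sketch (the csv file name is illustrative): the input is
# expected to contain (tweet_id, text) rows, as produced by json2csv_preprocess
# with fields ["id", "text"].
#
#     from nltk.tokenize import TweetTokenizer
#     pos_docs = parse_tweets_set("positive_tweets.csv", label="pos",
#                                 word_tokenizer=TweetTokenizer())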


# ////////////////////////////////////////////////////////////
# { Demos
# ////////////////////////////////////////////////////////////


def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test a classifier on 10000 tweets, tokenized using TweetTokenizer.
    Features are composed of:

    - 1000 most frequent unigrams
    - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import stopwords, twitter_samples
    from nltk.sentiment import SentimentAnalyzer
    from nltk.tokenize import TweetTokenizer

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ["id", "text"]
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = "positive_tweets.csv"
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = "negative_tweets.csv"
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer)

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12
    )
    sentim_analyzer.add_feat_extractor(
        extract_bigram_feats, bigrams=bigram_collocs_feats
    )

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset="labeled_tweets",
            Classifier=type(classifier).__name__,
            Tokenizer=tokenizer.__class__.__name__,
            Feats=extr,
            Results=results,
            Instances=n_instances,
        )


def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train a classifier on instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:

    - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    pos_docs = [
        (list(movie_reviews.words(pos_id)), "pos")
        for pos_id in movie_reviews.fileids("pos")[:n_instances]
    ]
    neg_docs = [
        (list(movie_reviews.words(neg_id)), "neg")
        for neg_id in movie_reviews.fileids("neg")[:n_instances]
    ]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs + train_neg_docs
    testing_docs = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset="Movie_reviews",
            Classifier=type(classifier).__name__,
            Tokenizer="WordPunctTokenizer",
            Feats=extr,
            Results=results,
            Instances=n_instances,
        )


def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between subjective
        and objective.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import subjectivity
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    subj_docs = [
        (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances]
    ]
    obj_docs = [
        (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances]
    ]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs]
    )

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer == True:
        sentim_analyzer.save_file(sentim_analyzer, "sa_subjectivity.pickle")

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset="subjectivity",
            Classifier=type(classifier).__name__,
            Tokenizer="WhitespaceTokenizer",
            Feats=extr,
            Instances=n_instances,
            Results=results,
        )

    return sentim_analyzer


def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp

    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load("sa_subjectivity.pickle")
    except LookupError:
        print("Cannot find the sentiment analyzer you want to load.")
        print("Training a new one using NaiveBayesClassifier.")
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))


def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print("Positive")
    elif pos_words < neg_words:
        print("Negative")
    elif pos_words == neg_words:
        print("Neutral")

    if plot == True:
        _show_plot(
            x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"]
        )


def demo_vader_instance(text):
    """
    Output polarity scores for a text using Vader approach.

    :param text: a text whose polarity has to be evaluated.
    """
    from nltk.sentiment import SentimentIntensityAnalyzer

    vader_analyzer = SentimentIntensityAnalyzer()
    print(vader_analyzer.polarity_scores(text))


def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using Vader approach.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict

    from nltk.corpus import twitter_samples
    from nltk.metrics import accuracy as eval_accuracy
    from nltk.metrics import f_measure as eval_f_measure
    from nltk.metrics import precision as eval_precision
    from nltk.metrics import recall as eval_recall
    from nltk.sentiment import SentimentIntensityAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ["id", "text"]
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = "positive_tweets.csv"
    json2csv_preprocess(
        positive_json,
        positive_csv,
        fields,
        strip_off_emoticons=False,
        limit=n_instances,
    )

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = "negative_tweets.csv"
    json2csv_preprocess(
        negative_json,
        negative_csv,
        fields,
        strip_off_emoticons=False,
        limit=n_instances,
    )

    pos_docs = parse_tweets_set(positive_csv, label="pos")
    neg_docs = parse_tweets_set(negative_csv, label="neg")

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()

    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    num = 0
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)["compound"]
        if score > 0:
            observed = "pos"
        else:
            observed = "neg"
        num += 1
        acc_test_results.append(observed)
        test_results[observed].add(i)
    metrics_results = {}
    for label in labels:
        accuracy_score = eval_accuracy(acc_gold_results, acc_test_results)
        metrics_results["Accuracy"] = accuracy_score
        precision_score = eval_precision(gold_results[label], test_results[label])
        metrics_results[f"Precision [{label}]"] = precision_score
        recall_score = eval_recall(gold_results[label], test_results[label])
        metrics_results[f"Recall [{label}]"] = recall_score
        f_measure_score = eval_f_measure(gold_results[label], test_results[label])
        metrics_results[f"F-measure [{label}]"] = f_measure_score

    for result in sorted(metrics_results):
        print(f"{result}: {metrics_results[result]}")

    if output:
        output_markdown(
            output,
            Approach="Vader",
            Dataset="labeled_tweets",
            Instances=n_instances,
            Results=metrics_results,
        )


if __name__ == "__main__":
    from sklearn.svm import LinearSVC

    from nltk.classify import MaxentClassifier, NaiveBayesClassifier
    from nltk.classify.scikitlearn import SklearnClassifier
    from nltk.twitter.common import _outf_writer, extract_fields

    naive_bayes = NaiveBayesClassifier.train
    svm = SklearnClassifier(LinearSVC()).train
    maxent = MaxentClassifier.train

    demo_tweets(naive_bayes)
    # demo_movie_reviews(svm)
    # demo_subjectivity(svm)
    # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ")
    # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True)
    # demo_vader_instance("This movie was actually neither that funny, nor super witty.")
    # demo_vader_tweets()