File size: 1,814 Bytes
ba7a003
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import sys
import os
from datetime import datetime
import pandas as pd
import contexttimer
from urllib.request import urlopen
import requests
from PIL import Image
import torch
from torchvision.transforms import functional as TF
from multiprocessing import Pool
from tqdm import tqdm
import logging
import sys
import numpy as np



# Tagger used by drop_no() to label caption tokens. The model file is given
# as a bare file name, so it is presumably looked up relative to the current
# working directory -- confirm against the deployment layout.
from nltk.tag import CRFTagger

ct = CRFTagger()
ct.set_model_file("all_indo_man_tag_corpus_model.crf.tagger")

# HTTP request headers; the User-Agent pretends to be googlebot.
headers = {
    "User-Agent": "Googlebot-Image/1.0",
    "X-Forwarded-For": "64.18.15.200",
}

# Setup: log INFO and above to download.log (truncated on every run) and
# silence urllib3's InsecureRequestWarning.
logging.basicConfig(
    filename="download.log",
    filemode="w",
    level=logging.INFO,
)
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)

# Validate the command line before doing any work: sys.argv[1] is the input
# .tsv that is read here and sys.argv[2] is the path the script later writes
# its filtered .tsv to. Without this check a missing argument only shows up
# as a bare IndexError. Extra trailing arguments are tolerated.
if len(sys.argv) < 3:
    sys.exit(
        "Provide input .tsv file name & output .tsv file name. "
        f"e.g. python {sys.argv[0]} Train-GCC-training.tsv filtered.tsv"
    )

# Load data
print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')

# Time the load; only the caption and url columns are kept.
with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')
    df = df[["caption", "url"]]

def drop_no(text, *, max_chars=96, max_nnp_ratio=0.8):
    """Decide whether a caption should be dropped from the dataset.

    A caption is dropped when any of the following holds:

    * it is not a string (for example a missing value read as NaN);
    * it is empty or consists only of whitespace;
    * it is longer than ``max_chars`` characters;
    * at least ``max_nnp_ratio`` of its whitespace-separated tokens are
      tagged ``"NNP"`` by the module-level tagger ``ct``;
    * the tagger raises while processing it.

    Args:
        text: Caption to inspect.
        max_chars: Longest caption length, in characters, that is kept.
        max_nnp_ratio: Fraction of ``"NNP"`` tokens at or above which the
            caption is dropped.

    Returns:
        True if the caption should be dropped, False if it should be kept.
    """
    # Reject non-strings explicitly instead of relying on len() raising a
    # TypeError that a blanket handler then swallows.
    if not isinstance(text, str):
        return True
    if not text or len(text) > max_chars:
        return True

    tokens = text.split()
    if not tokens:
        # Whitespace-only caption: nothing to tag, and the ratio below would
        # otherwise divide by zero.
        return True

    try:
        tagged = ct.tag_sents([tokens])[0]
    except Exception as e:
        # Best effort: a caption the tagger cannot process is reported and
        # dropped rather than aborting the whole run.
        print(e)
        return True

    if not tagged:
        return True

    nnp_cnt = sum(1 for pair in tagged if pair[1] == "NNP")
    return nnp_cnt / len(tagged) >= max_nnp_ratio
    
# Evaluate the caption filter once per row and keep only the rows it does
# not reject. Comparing with .eq(False) always yields a boolean mask, even
# when the frame is empty.
keep_mask = df["caption"].apply(drop_no).eq(False)
df = df.loc[keep_mask]

# Record each surviving row's original index label as a regular column,
# because the index itself is not written out below.
df = df.assign(index_row=df.index)

# Write the filtered table as tab-separated text to the path in argv[2].
df.to_csv(sys.argv[2], sep="\t", index=False)