File size: 3,101 Bytes
22738ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File  : run.py
# @Author: nixin
# @Date  : 2021/11/26

import pandas as pd
from functions import *

from functools import partial
import multiprocessing as mp


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~ Parameters for data_patent_details file ~~~ #
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
path_to_data = "/Users/nixin/PycharmProjects/PatentSolver_demonstrator/MCDA/data/"  #### don't forget to change

# File names inside ``path_to_data`` (joined by plain concatenation, so
# ``path_to_data`` must keep its trailing slash).
INPUT_CSV = 'results (18).csv'              # search results holding the 'patent_number' column
RAW_CSV = 'edison_patents.csv'              # rows appended by the scraper workers
CLEANED_CSV = 'cleaned_count_patents.csv'   # final cleaned + counted table

# Column order of the raw scraper CSV; also written as its header row.
data_column_order = ['inventor_name',
                     'assignee_name_orig',
                     'assignee_name_current',
                     'pub_date',
                     'priority_date',
                     'grant_date',
                     'filing_date',
                     'forward_cite_no_family',
                     'forward_cite_yes_family',
                     'backward_cite_no_family',
                     'backward_cite_yes_family',
                     'patent',
                     'url',
                     'abstract_text']


def load_patent_numbers(csv_path):
    """Return the 'patent_number' column of the CSV at *csv_path* as a list.

    The column index and the resulting list are printed, as the script has
    always done, so a run can be checked by eye.

    Raises:
        FileNotFoundError: if *csv_path* does not exist.
        KeyError: if the CSV has no 'patent_number' column.
    """
    df = pd.read_csv(csv_path)
    print(df.columns)

    patent_number = df['patent_number'].tolist()
    print(patent_number)
    return patent_number


def reset_output_csv(csv_path, columns):
    """(Re)create *csv_path* so that it contains only the header row *columns*.

    Opening with mode 'w' truncates an existing file, so no separate
    exists-check / delete step is needed.
    """
    # Imported here so this script does not depend on ``csv`` leaking in
    # through ``from functions import *``.
    import csv

    with open(csv_path, 'w', newline='') as file:
        csv.writer(file).writerow(columns)


def main():
    """Scrape every listed patent in parallel, then clean and count the results.

    Steps:
      1. Read the patent numbers from ``INPUT_CSV``.
      2. Reset ``RAW_CSV`` to a header-only file.
      3. Map ``single_process_scraper`` over the patent numbers in a worker pool.
      4. Run ``clean_patent`` and ``count_patent`` on the raw CSV and write the
         result to ``CLEANED_CSV``.

    ``poolcontext``, ``init``, ``single_process_scraper``, ``clean_patent`` and
    ``count_patent`` are expected to come from ``from functions import *``.
    """
    raw_csv_path = path_to_data + RAW_CSV

    patent_number = load_patent_numbers(path_to_data + INPUT_CSV)
    reset_output_csv(raw_csv_path, data_column_order)

    ########### Run pool process #############
    ## Create lock to prevent collisions when processes try to write on same file
    lock = mp.Lock()

    ## Use a pool of workers where the number of processes is equal to
    ##   the number of cpus - 1, but never fewer than one: on a single-CPU
    ##   machine ``cpu_count() - 1`` is 0 (presumably rejected by the pool
    ##   that ``poolcontext`` creates -- verify against functions.py).
    n_workers = max(1, mp.cpu_count() - 1)
    scrape_one = partial(single_process_scraper,
                         path_to_data_file=raw_csv_path,
                         data_column_order=data_column_order)
    with poolcontext(processes=n_workers, initializer=init, initargs=(lock,)) as pool:
        pool.map(scrape_one, patent_number)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~ clean raw data_patent_details ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

    ## read the Google scraper's results
    table = pd.read_csv(raw_csv_path)

    # clean raw patent results
    results = clean_patent(table)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~ count number ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

    results = count_patent(results)
    print(results.columns)
    results.to_csv(path_to_data + CLEANED_CSV, index=False)


# Everything that does work lives behind this guard. With the "spawn" start
# method each worker process re-imports this module; any unguarded top-level
# statement would run again in every worker -- re-reading the input, truncating
# the raw CSV while other workers append to it, and running the clean/count
# step on a half-written file.
if __name__ == "__main__":
    main()