ValadisCERTH committed on
Commit
afb5c93
·
1 Parent(s): 6655f4c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import re
3
+
4
+ from datetime import datetime
5
+
6
# Load the spaCy model. The model is only downloaded when it is not already
# installed, so a restart does not re-fetch the large package every time.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load raises OSError when the requested model cannot be found.
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
9
+
10
+ # Define a function to extract dates from text
11
# Regex patterns for common date formats, applied in this order.
# The \b word boundaries keep a pattern from matching in the middle of a token.
# Compiled once at import time instead of on every call.
_DATE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # "01/01/22" or "1-1-2022"
        r'\b\d{1,2}[-/]\d{1,2}\b(?!\d)',  # "01/01" or "1-1"
        r'\b[A-Z][a-z]{2,8} \d{1,2},? \d{2,4}\b',  # "January 1, 2022" or "Feb 28, 22"
        r'\b\d{1,2} [A-Z][a-z]{2,8} \d{2,4}\b',  # "1 January 2022" or "28 Feb 22"
        r'\b[A-Z][a-z]{2,8} \d{2,4}\b',  # "January 2022" or "Feb 22"
    )
)


def extract_dates(text):
    """
    Identify dates, both numeric and free-text, in ``text``.

    Candidates come from two sources: the regex patterns in ``_DATE_PATTERNS``
    and the entities that the module-level spaCy pipeline ``nlp`` labels as
    ``DATE``.

    A candidate is kept only if it is not a substring of a date that was
    collected earlier, so a short match such as "01/01" is dropped when
    "01/01/22" has already been found. The same check also rules out exact
    duplicates.

    Args:
        text: The sentence or paragraph to scan.

    Returns:
        A list of the date strings found, in discovery order (regex matches
        first, pattern by pattern, followed by spaCy entities).
    """
    matches = []

    # Regex pass: earlier patterns win over later ones that match a fragment
    # of the same date.
    for pattern in _DATE_PATTERNS:
        for match in pattern.findall(text):
            if all(match not in m for m in matches):
                matches.append(match)

    # spaCy pass: add DATE entities that are not already covered by a match.
    for ent in nlp(text).ents:
        if ent.label_ != 'DATE':
            continue
        date_str = ent.text
        if all(date_str not in m for m in matches):
            matches.append(date_str)

    # The substring checks above already prevent duplicates, so the list is
    # returned as-is; this keeps the result order stable between runs.
    return matches
48
+
49
+
50
# strptime formats, tried in this exact order; the first one that parses wins.
# Each entry pairs a format with the datetime fields reported for it.
_DATE_FORMATS = (
    ('%B %d, %Y', ('day', 'month', 'year')),
    ('%m-%d-%y', ('day', 'month', 'year')),
    ('%d/%m', ('day', 'month')),
    ('%B %d', ('day', 'month')),
    ('%b %d', ('day', 'month')),
    ('%B %Y', ('month', 'year')),
    ('%Y', ('year',)),
    ('%d/%m/%y', ('day', 'month', 'year')),
    ('%Y %d %m', ('day', 'month', 'year')),  # note: day is read before month
    ('%B', ('month',)),
)


def convert_dates(date_list):
    """
    Parse each date string and describe which parts (day, month, year) it has.

    Every string is tried against the formats in ``_DATE_FORMATS`` in order.
    The first format that parses produces a tag string such as
    ``"day:2, month:1, year:2023"`` or ``"month:1, year:2023"``.

    Args:
        date_list: Iterable of date strings (e.g. the output of
            ``extract_dates``).

    Returns:
        A list with one string per input, in input order. Strings that match
        no known format are reported as ``"INVALID FORMAT: <original text>"``
        rather than raising.
    """
    output_list = []
    for date_str in date_list:
        for fmt, fields in _DATE_FORMATS:
            try:
                dt = datetime.strptime(date_str, fmt)
            except ValueError:
                # This format does not fit; fall through to the next one.
                continue
            # Build the tag from the datetime attributes directly. This avoids
            # the platform-specific '%-m' strftime directive and keeps every
            # entry a plain string.
            output_list.append(
                ", ".join(f"{name}:{getattr(dt, name)}" for name in fields)
            )
            break
        else:
            output_list.append(f'INVALID FORMAT: {date_str}')
    return output_list
108
+
109
+
110
+
111
def dates_binding(text):
    """
    Run the full date pipeline on ``text``.

    The dates found by ``extract_dates`` are handed to ``convert_dates``, and
    the resulting list of tagged dates is returned unchanged.
    """
    return convert_dates(extract_dates(text))
117
+
118
+
119
+ from transformers import pipeline
120
+ import gradio as gr
121
+
122
# Texts shown in the header of the demo page.
title = "Natural Language module Demo for Dates"
description = "This is a simple demo just for demonstration purposes, so that Serco team might have the chance to validate the results of the Natural Language module concerning date identification, while in progress"

# Sample queries offered in the UI; each one is wrapped in its own
# single-element list, one entry per input component.
_example_queries = (
    "Earthquake located in Ishkoshim, Tajikistan in 2018 05 09 with magnitude greater than 6.2",
    "Give me all the earthquakes with magnitude above than 6 in the region of Athens for the month of January 2, 2023",
    "Earthquake happened in Rome during 1986 with a magnitude of 6",
    "I want all earthquakes larger than 5.0 that occurred in Rome during 3/5/20",
    "I want all earthquakes happened in Greece 31/12",
    "I want all earthquakes happened in Greece January 5",
    "I want all earthquakes happened in Greece 3-4-20",
)
examples = [[query] for query in _example_queries]

# Build the text-in / text-out interface around dates_binding and start it.
# NOTE(review): `enable_queue` appears to be deprecated in newer Gradio
# releases - confirm against the installed version.
gr.Interface(
    fn=dates_binding,
    inputs="text",
    outputs="text",
    title=title,
    description=description,
    examples=examples,
    enable_queue=True,
).launch()