nam194 committed on
Commit
4e53297
·
1 Parent(s): f91691d

Update parse_info.py

Browse files
Files changed (1) hide show
  1. parse_info.py +25 -9
parse_info.py CHANGED
@@ -1,11 +1,4 @@
1
- import string
2
- import chardet, string, gdown, re
3
- from pathlib import Path
4
- from nltk import everygrams
5
- from collections import Counter
6
- from typing import List, Optional
7
- from datetime import datetime
8
- from dateutil import parser, relativedelta
9
 
10
  punc = list(string.punctuation)
11
  def parse_string(inp: str, rep=" ", punc=punc, excp=[]) -> str:
@@ -90,4 +83,27 @@ def parse_designation(inp: List) -> str:
90
 
91
  def parse_email(inp: List) -> str:
92
  inp = list(set([parse_string(i, rep="", excp=["@","."]) for i in inp]))
93
- return " ".join(inp)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from imports import *
 
 
 
 
 
 
 
2
 
3
  punc = list(string.punctuation)
4
  def parse_string(inp: str, rep=" ", punc=punc, excp=[]) -> str:
 
83
 
84
  def parse_email(inp: List) -> str:
85
  inp = list(set([parse_string(i, rep="", excp=["@","."]) for i in inp]))
86
+ return " ".join(inp)
87
+
88
+ def decontracted(phrase) -> str:
# Clean up one text fragment and return it as a single line.
# Steps, in order: delete a fixed set of stray symbols, strip both ends,
# apply Unicode NFC normalization, tidy the spacing, and finally turn
# newlines into spaces.
# NOTE(review): `re` and `unicodedata` are not imported in the visible lines;
# presumably they arrive through `from imports import *` -- confirm.
# NOTE(review): this rendering has lost indentation and may have collapsed
# runs of spaces inside the string literals below (several of them look
# identical here). Check the raw file before relying on the exact spacing
# rules or on which statements sit inside the `if`.
89
# Every alternative in the pattern is deleted (the replacement is "").
+ phrase = re.sub(r"â€|™|“|”|;|ü|\xad|\xa0|\u200b|·|∙|�|●|�|§|•|!|▪|©|\?|\]|\[|\)|\(", "", phrase)
90
+ phrase = phrase.strip()
91
+ phrase = unicodedata.normalize("NFC", phrase)
92
+ if " " in phrase or " " in phrase: # check space character
93
# "_" is used as a temporary placeholder and turned back into a space at the
# end of the chain, so any "_" already present in the input also becomes a space.
+ phrase = phrase.replace(" ","_").replace(" ","_").replace(" ","").replace("_"," ")
94
+ tmp = phrase.split(" ")
95
# check_parse stays True only when no piece of the split is longer than one
# character.
+ check_parse = True
96
+ for i in tmp:
97
+ if len(i) > 1:
98
+ check_parse = False
99
+ break
100
# When every piece is at most one character long, the separators are removed
# so the pieces are joined together.
+ if check_parse:
101
+ phrase = phrase.replace(" ","")
102
+ # phrase = phrase.replace(" "," ").replace(" "," ")
103
# Newlines inside the fragment are replaced by spaces in the returned value.
+ return phrase.replace("\n"," ")
104
+
105
def normalize_bbox(bbox, size):
    """Scale a bounding box onto the integer range [0, 1000].

    Items 0 and 2 of *bbox* are divided by ``size[0]`` and items 1 and 3 by
    ``size[1]`` (presumably ``(x0, y0, x1, y1)`` against ``(width, height)``
    -- confirm with the caller).  Each scaled value is truncated with
    ``int`` and then clamped, so a box that extends past the page no longer
    yields coordinates outside the required [0, 1000] range.

    Args:
        bbox: Sequence with at least four numeric items.
        size: Sequence with at least two non-zero numeric items.

    Returns:
        A list of four ints, each between 0 and 1000 inclusive.

    Raises:
        ZeroDivisionError: If ``size[0]`` or ``size[1]`` is zero.
        IndexError: If *bbox* or *size* is too short.
    """
    extents = (size[0], size[1], size[0], size[1])
    return [
        # Clamp after truncation to keep the documented [0, 1000] contract.
        min(1000, max(0, int(1000 * bbox[i] / extents[i])))
        for i in range(4)
    ]