File size: 1,912 Bytes
97208ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
import spacy
from collections import Counter

# Load the 'en_core_web_sm' spaCy pipeline once, at import time; preprocess_text()
# below calls this shared module-level instance rather than loading it per call.
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Clean *text*, parse it with the module-level ``nlp`` pipeline, and summarise it.

    Each newline, carriage return and tab is replaced by a single space and
    every run of non-ASCII characters is removed before parsing, so all
    reported character offsets refer to the cleaned string rather than to
    the caller's original input.

    Returns:
        A dict with the following keys:

        original_text: the cleaned string that was actually parsed.
        sentences: text of each sentence, in document order.
        pos_groups: token texts bucketed by ``token.pos_``; only the tags
            listed in the function body are tracked, and tokens carrying
            any other tag are left out of this mapping.
        named_entities: one ``{"text", "label"}`` dict per entity.
        dependencies: one ``{"token", "dep", "head"}`` dict per token.
        token_offsets: one ``{"token", "start", "end"}`` dict per token.
        word_frequency: occurrences of each distinct token text
            (case-sensitive; no token is filtered out).
        sentence_lengths: number of tokens in each sentence.
        pos_counts: occurrences of each ``token.pos_`` value in the document.
        names: texts of entities labelled ``PERSON``.
        locations: texts of entities labelled ``GPE`` or ``LOC``.
    """
    cleaned = re.sub(r'[\n\r\t]', ' ', text)
    cleaned = re.sub(r'[^\x00-\x7F]+', '', cleaned)
    doc = nlp(cleaned)

    # Materialise sentences once; per-token data is gathered sentence by sentence.
    sentence_spans = list(doc.sents)
    tokens = [tok for span in sentence_spans for tok in span]
    entities = doc.ents

    tracked_tags = (
        "NOUN", "VERB", "ADJ", "ADV", "PROPN",
        "DET", "AUX", "PRON", "ADP", "NUM",
        "PART", "PUNCT", "INTJ", "X",
    )
    by_pos = {tag: [] for tag in tracked_tags}
    for tok in tokens:
        bucket = by_pos.get(tok.pos_)
        if bucket is not None:
            bucket.append(tok.text)

    return {
        "original_text": cleaned,
        "sentences": [span.text for span in sentence_spans],
        "pos_groups": by_pos,
        "named_entities": [
            {"text": ent.text, "label": ent.label_} for ent in entities
        ],
        "dependencies": [
            {"token": tok.text, "dep": tok.dep_, "head": tok.head.text}
            for tok in tokens
        ],
        "token_offsets": [
            {"token": tok.text, "start": tok.idx, "end": tok.idx + len(tok.text)}
            for tok in tokens
        ],
        "word_frequency": dict(Counter(tok.text for tok in tokens)),
        "sentence_lengths": [len(span) for span in sentence_spans],
        "pos_counts": dict(Counter(tok.pos_ for tok in doc)),
        "names": [ent.text for ent in entities if ent.label_ == "PERSON"],
        "locations": [
            ent.text for ent in entities if ent.label_ in {"GPE", "LOC"}
        ],
    }