|
"""Generate a similarity matrix (doc-term score matrix) based on textacy.representation.Vectorizer. |
|
|
|
refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer). |
|
originally docterm_scores.py. |
|
""" |
|
from typing import Dict, Iterable, List, Optional, Union |
|
import numpy as np |
|
from itertools import chain |
|
from psutil import virtual_memory |
|
from more_itertools import ilen |
|
|
|
from textacy.representations import Vectorizer |
|
|
|
from logzero import logger |
|
|
|
|
|
from gradiobee.gen_model import gen_model |
|
|
|
|
|
|
|
def _tokenized_rows(doc: Iterable[Iterable[str]], name: str) -> List[List[str]]:
    """Materialize ``doc`` as a list of token lists and check every token is a str.

    Materializing lets the caller traverse the data several times even when
    ``doc`` (or one of its rows) is a one-shot iterator such as a generator.

    Args:
        doc: the tokenized document to check, one iterable of tokens per row.
        name: argument name used in the error message ("doc1" or "doc2").

    Returns:
        A new list holding one list of tokens per row of ``doc``.

    Raises:
        AssertionError: if any token is not a ``str``.
        Exception: any other error raised while iterating (e.g. ``TypeError``
            for a non-iterable) is logged and re-raised unchanged.
    """
    rows: List[List[str]] = []
    try:
        for row in doc:
            tokens = list(row)
            for token in tokens:
                # Explicit raise instead of an ``assert`` statement: asserts
                # are stripped under ``python -O``, which would disable the check.
                if not isinstance(token, str):
                    raise AssertionError(
                        f" {name} is not of the typing Iterable[Iterable[str]] "
                    )
            rows.append(tokens)
    except AssertionError:
        raise
    except Exception as exc:
        logger.error(exc)
        raise
    return rows


def smatrix(
    doc1: Iterable[Iterable[str]],
    doc2: Iterable[Iterable[str]],
    model: Optional[Vectorizer] = None,
    tf_type: str = 'linear',
    idf_type: Optional[str] = "smooth",
    dl_type: Optional[str] = None,
    norm: Optional[str] = "l2",
    min_df: Union[int, float] = 1,
    max_df: Union[int, float] = 1.0,
    max_n_terms: Optional[int] = None,
    vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
) -> np.ndarray:
    """Generate a doc-term score (similarity) matrix for two tokenized docs.

    Both docs are transformed with ``model`` and the result is the dot
    product of the doc2 matrix with the transposed doc1 matrix.

    Args:
        doc1: tokenized doc of n1 rows (each row an iterable of str tokens).
        doc2: tokenized doc of n2 rows (each row an iterable of str tokens).
        model: object providing ``transform``; if None, one is generated
            ad hoc via ``gen_model`` from doc1 followed by doc2.
        tf_type, idf_type, dl_type, norm, min_df, max_df, max_n_terms,
        vocabulary_terms: forwarded to ``gen_model`` and only used when
            ``model`` is None; refer to textacy.representations.Vectorizer.

    Returns:
        The dense array ``dt2.toarray().dot(dt1.toarray().T)``. Assuming
        ``model.transform`` yields one row per input row (as textacy's
        Vectorizer documents), the shape is (n2, n1): rows follow doc2 and
        columns follow doc1.

    Raises:
        AssertionError: if doc1 or doc2 contains a token that is not a str.

    Side effects:
        The model actually used is stored on the function as ``smatrix.model``.
    """
    # Validate and materialize in one pass so that one-shot iterables survive
    # the later passes (model fitting, transform, size estimate).
    rows1 = _tokenized_rows(doc1, "doc1")
    rows2 = _tokenized_rows(doc2, "doc2")

    if model is None:
        model = gen_model(
            rows1 + rows2,
            tf_type=tf_type,
            idf_type=idf_type,
            dl_type=dl_type,
            norm=norm,
            min_df=min_df,
            max_df=max_df,
            max_n_terms=max_n_terms,
            vocabulary_terms=vocabulary_terms
        )

    # Expose the model in use so callers can inspect or reuse it.
    smatrix.model = model

    dt1 = model.transform(rows1)
    dt2 = model.transform(rows2)

    # Size estimate for the result: one 8-byte value per (doc2 row, doc1 row) pair.
    require_ram = len(rows1) * len(rows2) * 8
    available = virtual_memory().available
    if require_ram > available:
        logger.warning("virtual_memory().available: %s", available)
        logger.warning("memory required: %s", require_ram)

    if require_ram > available * 10:
        logger.warning("You're likely to encounter memory problem, such as slowing down response and/or OOM.")

    return dt2.toarray().dot(dt1.toarray().T)
|
|