import copy
import numpy as np
import pandas as pd
import scipy.sparse as sp
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from melusine.utils.transformer_scheduler import TransformerScheduler
from melusine import config
keywords = config["words_list"]["keywords"]
stopwords = config["tokenizer"]["stopwords"] + list(
config["token_flagger"]["token_flags"]["flag_name"]
)
class KeywordsGenerator(BaseEstimator, TransformerMixin):
"""Class to extract list of keywords from text.
It is compatible with scikit-learn API (i.e. contains fit, transform
methods).
Parameters
----------
max_tfidf_features : int, optional
Size of vocabulary for tfidf.
Default value, 10000.
    keywords : list, optional
        Keywords to be extracted as a priority.
        Default value, "keywords" list defined in conf file.
    stopwords : list, optional
        Stopwords not to be extracted as keywords.
        Default value, "stopwords" list and token flags defined in conf file.
    resample : bool, optional
        True if the dataset must be resampled according to the class
        distribution, else False.
        Default value, False.
    n_jobs : int, optional
        Number of cores used for computation.
        Default value, 1.
    progress_bar : bool, optional
        Whether to display a progress bar during computation.
        Default value, False.
copy : bool, optional
Make a copy of DataFrame.
Default value, True.
n_max_keywords : int, optional
Maximum number of keywords to be returned.
Default value, 6.
n_min_keywords : int, optional
Minimum number of keywords to be returned.
Default value, 0.
threshold_keywords : float, optional
Minimum tf-idf score for word to be selected as keyword.
Default value, 0.0.
n_docs_in_class : int, optional
        Number of documents in each class.
Default value, 100.
keywords_coef : int, optional
        Coefficient applied to the tf-idf score of each keyword.
Default value, 10.
Attributes
----------
max_tfidf_features, keywords, stopwords, resample, n_jobs, progress_bar,
copy, n_max_keywords, n_min_keywords, threshold_keywords, n_docs_in_class,
keywords_coef,
tfidf_vectorizer : TfidfVectorizer instance from sklearn,
dict_scores_ : dictionary,
        Tf-idf score of each token.
    max_score_ : np.array,
        Maximum tf-idf score over the vocabulary.
Examples
--------
>>> from melusine.summarizer.keywords_generator import KeywordsGenerator
>>> keywords_generator = KeywordsGenerator()
>>> keywords_generator.fit(X, y)
>>> keywords_generator.transform(X)
>>> print(X['keywords'])
"""
def __init__(
self,
max_tfidf_features=10000,
keywords=keywords,
stopwords=stopwords,
resample=False,
n_jobs=1,
progress_bar=False,
copy=True,
n_max_keywords=6,
n_min_keywords=0,
threshold_keywords=0.0,
n_docs_in_class=100,
keywords_coef=10,
):
self.max_tfidf_features = max_tfidf_features
self.tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features)
self.keywords = keywords
self.stopwords = stopwords
self.resample = resample
self.n_jobs = n_jobs
self.progress_bar = progress_bar
self.copy = copy
self.n_max_keywords = n_max_keywords
self.n_min_keywords = n_min_keywords
self.threshold_keywords = threshold_keywords
self.n_docs_in_class = n_docs_in_class
self.keywords_coef = keywords_coef
    def fit(self, X, y=None):
"""Fit the weighted tf-idf model with input data.
If resample attribute is True the dataset will be resampled according
to class distribution.
Parameters
----------
X : pandas.DataFrame, shape (n_samples, n_features)
X must contain ['tokens'] column.
y : Ignored
Returns
-------
self : object
Returns the instance itself.
"""
if isinstance(X, dict):
raise TypeError(
"You should not use fit on a dictionary object. Use a DataFrame"
)
if self.resample:
X_resample = self.resample_docs(X, y)
else:
X_resample = X
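        # drop stopwords (and flagged tokens by default) before fitting the vectorizer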
X_resample["tokens"] = X_resample["tokens"].apply(self._remove_stopwords)
        # fit tf-idf on the resampled dataset
tokens_joined = X_resample["tokens"].apply(lambda x: " ".join(x))
self.tfidf_vectorizer.fit(tokens_joined)
# modify the idf weights given frequency in the corpus
idf_weights = self._add_tf_to_idf(X_resample)
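        # overwrite the fitted idf diagonal matrix of the vectorizer with the new weights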
self.tfidf_vectorizer._tfidf._idf_diag = sp.spdiags(
idf_weights, diags=0, m=len(idf_weights), n=len(idf_weights)
)
        # map each token of the fitted vocabulary to its weighted idf score
self.dict_scores_ = dict(
zip(
self.tfidf_vectorizer.get_feature_names_out(),
self.tfidf_vectorizer.idf_,
)
)
self.max_score_ = np.max(self.tfidf_vectorizer.idf_)
return self
    def get_keywords(self, row):
"""Returns list of keywords in apparition order with the
weighted tf-idf already fitted.
Parameters
----------
row : row of pd.Dataframe, columns ['tokens']
Returns
-------
list of strings
"""
tokens = self._remove_stopwords(row["tokens"])
tokens = [x for x in tokens if not x.isdigit()]
scores = Counter({t: self.dict_scores_.get(t, 0) for t in tokens})
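        # number of keywords to return: count of tokens scoring above the
        # threshold, clipped to the [n_min_keywords, n_max_keywords] range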
n = sum(i > self.threshold_keywords for i in list(scores.values()))
n = min(n, self.n_max_keywords)
n = max(n, self.n_min_keywords)
keywords = [x[0] for x in scores.most_common(n)]
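        # re-order the selected keywords by their first position in the text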
index_sorted = [(k, tokens.index(k)) for k in keywords if k in tokens]
index_sorted = sorted(index_sorted, key=lambda x: x[1])
keywords_sorted = [i[0] for i in index_sorted]
return keywords_sorted
    def resample_docs(self, X, y=None):
"""Method for resampling documents according to class distribution."""
X_ = X.copy()
if y is not None:
X_["label"] = y
X_["split"] = 0
for c in X_.label.unique():
N_c = X_[X_["label"] == c].shape[0]
I_c = np.random.randint(0, self.n_docs_in_class + 1, N_c)
X_.loc[X_["label"] == c, "split"] = I_c
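        # merge documents sharing the same (label, bucket) pair by concatenating
        # their token lists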
X_resample = pd.DataFrame(
X_[["label", "split", "tokens"]]
.groupby(["label", "split"], as_index=False)["tokens"]
.sum()
)
return X_resample
def _remove_stopwords(self, tokens):
"""Method to filter stopwords from potential list of keywords."""
return [t for t in tokens if t not in self.stopwords]
def _add_tf_to_idf(self, X):
"""Returns the tf-idf weights of each tokens"""
tokens_joined = X["tokens"].apply(lambda x: " ".join(x))
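        # score every document of the (resampled) corpus with the fitted tf-idf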
X_vec = self.tfidf_vectorizer.transform(tokens_joined)
        # list() so that feature_names.index() can be used in _get_weights
        feature_names = list(self.tfidf_vectorizer.get_feature_names_out())
idf_weights = self._get_weights(X_vec.toarray(), self.keywords, feature_names)
return idf_weights
def _get_weights(self, X_vec, keywords_list, feature_names):
"""Put max weights for each word of redistributed mails."""
max_ = np.max(X_vec, axis=0)
mmax_ = np.max(max_)
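        # boost each priority keyword to keywords_coef times the highest score
        # observed in the corpus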
for k in keywords_list:
if k in feature_names:
max_[feature_names.index(k)] = mmax_ * self.keywords_coef
return max_