import logging
from gensim.models import Word2Vec, KeyedVectors
logger = logging.getLogger(__name__)
class Embedding:
"""Class to train embeddings with Word2Vec algorithm.
Attributes
----------
word2id: dict,
Word vocabulary (key: word, value: word_index.
embedding : Gensim KeyedVector Instance,
Gensim KeyedVector Instance relative to the specific trained or imported embedding.
method : str,
One of the following :
- "word2vec_sg" : Trains a Word2Vec Embedding using the Skip-Gram method (usually takes a long time).
- "word2vec_cbow" : Trains a Word2Vec Embedding using the Continuous Bag-Of-Words method.
- "lsa_docterm" : Trains an Embedding by using an SVD on a Document-Term Matrix.
- "lsa_tfidf" : Trains an Embedding by using an SVD on a TF-IDFized Document-Term Matrix.
train_params : dict,
Additional parameters for the embedding training. Check the following documentation :
- gensim.models.Word2Vec for Word2Vec Embeddings
- sklearn.decomposition.TruncatedSVD for LSA Embeddings
If left untouched, the default training values will be kept from the aforementioned packages.
Examples
--------
>>> from melusine.nlp_tools.embedding import Embedding
>>> embedding = Embedding()
>>> embedding.train(X) # noqa
>>> embedding.save(filepath) # noqa
>>> embedding = Embedding().load(filepath) # noqa
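
    Once trained, the underlying Gensim KeyedVectors instance can be queried
    directly; the token below is purely illustrative and assumes "client" is
    in the learned vocabulary:

    >>> embedding.embedding.most_similar("client")  # noqa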
"""
def __init__(
self,
tokens_column=None,
workers=40,
random_seed=42,
iter=15,
size=300,
method="word2vec_cbow",
min_count=100,
):
"""
Parameters :
----------
workers : int,
Number of CPUs to use for the embedding training process (default=40).
random_seed : int,
Seed for reproducibility (default=42).
iter : int,
Number of epochs (default=15). Used for Word2Vec and GloVe only.
size : int,
Desired embedding size (default=300).
window : int,
If Word2Vec, window used to find center-context word relationships.
If GloVe, window used to compute the co-occurence matrix.
min_count : int,
Minimum number of appeareance of a token in the corpus for it to be kept in the vocabulary (default=100).
stop_removal : bool,
If True, removes stopwords in the Streamer process (default=True).
method : str,
One of the following :
- "word2vec_sg" : Trains a Word2Vec Embedding using the Skip-Gram method and Negative-Sampling.
- "word2vec_cbow" : Trains a Word2Vec Embedding using the Continuous Bag-Of-Words method
and Negative-Sampling.
- "lsa_docterm" : Trains an Embedding by using an SVD on a Document-Term Matrix.
- "lsa_tfidf" : Trains an Embedding by using an SVD on a TF-IDFized Document-Term Matrix.
min_count : int
Minimum number of occurence of a word to be included in the vocabulary
"""
self.tokens_column = tokens_column
self.input_data = None
self.word2id = {}
self.embedding = None
self.method = method
self.workers = workers
if self.method in ["word2vec_sg", "word2vec_cbow"]:
self.train_params = {
"size": size,
"alpha": 0.025,
"min_count": min_count,
"max_vocab_size": None,
"sample": 0.001,
"seed": random_seed,
"workers": workers,
"min_alpha": 0.0001,
"negative": 5,
"hs": 0,
"ns_exponent": 0.75,
"cbow_mean": 1,
"iter": iter,
"null_word": 0,
"trim_rule": None,
"sorted_vocab": 1,
"batch_words": 10000,
"compute_loss": False,
"callbacks": (),
"max_final_vocab": None,
}
if self.method == "word2vec_sg":
self.train_params["sg"] = 1
self.train_params["window"] = 10
elif self.method == "word2vec_cbow":
self.train_params["sg"] = 0
self.train_params["window"] = 5
        else:
            raise ValueError(
                f"Embedding method '{method}' is not recognized or not implemented yet."
            )
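
    # Illustrative configuration sketch (the column name "tokens" and the
    # parameter values below are hypothetical, not library defaults):
    #
    #     Embedding(tokens_column="tokens", method="word2vec_sg",
    #               size=100, min_count=5)
    #
    # would train a 100-dimensional Skip-Gram model; the context window
    # (10 for Skip-Gram, 5 for CBOW) is set by __init__ above and is not a
    # constructor argument.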
    def save(self, filepath):
"""Method to save Embedding object."""
self.embedding.save(filepath)
    def load(self, filepath):
"""Method to load Embedding object."""
self.embedding = KeyedVectors.load(filepath, mmap="r")
return self
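
    # Persistence round-trip sketch (the path "embedding.kv" is hypothetical):
    #
    #     embedding.save("embedding.kv")
    #     embedding = Embedding().load("embedding.kv")
    #
    # load() memory-maps the stored KeyedVectors (mmap="r"), so a large
    # embedding matrix can be shared between processes instead of copied.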
    def train(self, X):
"""Train embeddings with the desired word embedding algorithm (default is Word2Vec).
Parameters
----------
X : pd.Dataframe
Containing a clean body column.
"""
logger.info("Start training word embeddings")
# Get token corpus
self.input_data = X[self.tokens_column].tolist()
if self.method in ["word2vec_sg", "word2vec_cbow"]:
self.train_word2vec()
logger.info("Finished training word embeddings")
    def train_word2vec(self):
"""Fits a Word2Vec Embedding on the given documents, and update the embedding attribute."""
embedding = Word2Vec(
vector_size=self.train_params["size"],
alpha=self.train_params["alpha"],
window=self.train_params["window"],
min_count=self.train_params["min_count"],
max_vocab_size=self.train_params["max_vocab_size"],
sample=self.train_params["sample"],
seed=self.train_params["seed"],
workers=self.train_params["workers"],
min_alpha=self.train_params["min_alpha"],
sg=self.train_params["sg"],
hs=self.train_params["hs"],
negative=self.train_params["negative"],
ns_exponent=self.train_params["ns_exponent"],
cbow_mean=self.train_params["cbow_mean"],
epochs=self.train_params["iter"],
null_word=self.train_params["null_word"],
trim_rule=self.train_params["trim_rule"],
sorted_vocab=self.train_params["sorted_vocab"],
batch_words=self.train_params["batch_words"],
compute_loss=self.train_params["compute_loss"],
callbacks=self.train_params["callbacks"],
max_final_vocab=self.train_params["max_final_vocab"],
)
embedding.build_vocab(self.input_data)
embedding.train(
self.input_data,
total_examples=embedding.corpus_count,
epochs=self.train_params["iter"],
)
self.embedding = embedding.wv
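

# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (illustrative, not part of the library API).
# Assumes pandas is installed; the column name "tokens" and the toy corpus are
# hypothetical, and small size/min_count values keep the run fast.
if __name__ == "__main__":
    import pandas as pd

    corpus = [["bonjour", "client"], ["bonjour", "monde"], ["client", "email"]] * 50
    df = pd.DataFrame({"tokens": corpus})

    # Train a small CBOW model on the toy corpus.
    emb = Embedding(tokens_column="tokens", workers=1, iter=5, size=10, min_count=1)
    emb.train(df)

    # Query the trained KeyedVectors directly.
    print(emb.embedding.most_similar("bonjour"))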