# Copyright 2023 The HIP team, University Hospital of Lausanne (CHUV), Switzerland & Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module that provides function to handle word embeddings and operations on them."""
# External imports
import numpy as np
from scipy import spatial
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# Internal imports
from mip_dmp.utils.io import load_glove_model, load_c2v_model
def glove_embedding(text, glove_model):
    """Find the GloVe embedding for the text.

    The text is lowercased and split into its individual characters
    (dropping any ``_``), and the embedding is the element-wise sum of
    the GloVe vectors of those characters.

    Parameters
    ----------
    text : str
        Text to be embedded.

    glove_model :
        GloVe model to be used, loaded by the gensim library.
        Must support item access (``glove_model[token]``).

    Returns
    -------
    numpy.ndarray
        GloVe embedding for the text.

    Notes
    -----
    Raises ``KeyError`` if a character is not in the model's vocabulary.
    For an empty ``text`` the sum collapses to the scalar ``0.0``
    (``np.sum`` over an empty array) — callers should avoid empty input.
    """

    def preprocess_text(text):
        """Lowercase the text and split it into characters, dropping ``_``.

        Parameters
        ----------
        text : str
            Text to be preprocessed.

        Returns
        -------
        list of str
            List of single characters (without underscores).
        """
        text = text.lower()
        # Iterating a string yields characters; drop separators.
        return [char for char in text if char != "" and char != "_"]

    # Preprocess the text into a list of characters.
    tokens = preprocess_text(text)
    # Sum the per-character GloVe vectors to get one embedding.
    embedding = np.sum(np.array([glove_model[token] for token in tokens]), axis=0)
    return embedding
def chars2vec_embedding(text, chars2vec_model):
    """Find the chars2vec embedding for the text.

    Parameters
    ----------
    text : str
        Text to be embedded.

    chars2vec_model :
        chars2vec model to be used, loaded by the chars2vec library.
        Must provide a ``vectorize_words(list_of_str)`` method.

    Returns
    -------
    numpy.ndarray
        chars2vec embedding for the text.
    """
    # The chars2vec model expects a list of strings as input and returns
    # a list of embeddings, so wrap the text and take the first element.
    embedding = chars2vec_model.vectorize_words([text])[0]
    return embedding
def embedding_similarity(x_embedding, y_embedding):
    """Compute the cosine distance between two embeddings.

    Note: ``scipy.spatial.distance.cosine`` returns the cosine
    *distance* (1 - cosine similarity), so 0.0 means identical
    direction and larger values mean less similar.

    Parameters
    ----------
    x_embedding : numpy.ndarray
        First embedding to compare.

    y_embedding : numpy.ndarray
        Second embedding to compare.

    Returns
    -------
    float
        Cosine distance between the two embeddings.
    """
    return spatial.distance.cosine(x_embedding, y_embedding)
def generate_embeddings(words: list, embedding_method: str = "chars2vec"):
    """Generate embeddings for a list of words.

    Parameters
    ----------
    words : list
        List of words to generate embeddings for.

    embedding_method : str
        Embedding method to be used, either "chars2vec" or "glove".

    Returns
    -------
    list or None
        List of embeddings for the words, or ``None`` if
        ``embedding_method`` is not recognized.
    """
    print(f"> Generating embeddings for {len(words)} words...")
    if embedding_method == "chars2vec":
        c2v_model = load_c2v_model()
        embeddings = [chars2vec_embedding(word, c2v_model) for word in words]
    elif embedding_method == "glove":
        glove_model = load_glove_model()
        embeddings = [glove_embedding(word, glove_model) for word in words]
    else:
        # Keep the None contract but surface the problem, consistent with
        # the error reporting in reduce_embeddings_dimension().
        print(f"ERROR: Invalid embedding method ({embedding_method})!")
        embeddings = None
    return embeddings
def find_n_closest_embeddings(
    word_embedding: np.array, embeddings: list, embedding_words: list, n: int = 5
):
    """Find the n closest embeddings to the given embedding.

    Closeness is measured by cosine distance (smaller = closer).

    Parameters
    ----------
    word_embedding : numpy.ndarray
        Embedding to find the n closest embeddings to.

    embeddings : list
        List of embeddings to search in.

    embedding_words : list
        List of words corresponding to the embeddings that will be
        resorted and reduced accordingly.

    n : int
        Number of closest embeddings to find.

    Returns
    -------
    dict
        Dictionary containing the n closest embeddings, their distances
        to the given embedding, and the corresponding words, in the form::

            {
                "distances": [float],
                "embeddings": [numpy.ndarray],
                "embedding_words": [str]
            }
    """
    # Cosine distance of every candidate to the query embedding.
    distances = np.array(
        [spatial.distance.cosine(word_embedding, embedding) for embedding in embeddings]
    ).astype(np.float32)
    # Indices of the n smallest distances, in ascending order.
    closest_indices = np.argsort(distances)[:n]
    return {
        "distances": [distances[i] for i in closest_indices],
        "embeddings": [embeddings[i] for i in closest_indices],
        "embedding_words": [embedding_words[i] for i in closest_indices],
    }
def reduce_embeddings_dimension(
    embeddings: list, reduce_method: str = "tsne", n_components: int = 3
):
    """Reduce the dimension of the embeddings, mainly for visualization purposes.

    Parameters
    ----------
    embeddings : list
        List of embeddings to reduce the dimension of.

    reduce_method : str
        Method to use to reduce the dimension, either "tsne" or "pca".

    n_components : int
        Number of components to reduce the dimension to.

    Returns
    -------
    tuple of numpy.ndarray or None
        One 1-D array per reduced component (``n_components`` arrays),
        or ``None`` if ``reduce_method`` is invalid.
    """
    print(
        f"> Reducing embeddings dimensionality to {n_components} using {reduce_method}..."
    )
    if reduce_method == "tsne":
        tsne_model = TSNE(
            perplexity=40,
            n_components=n_components,
            init="pca",
            n_iter=2500,
            random_state=42,
        )
        reduction_values = tsne_model.fit_transform(np.array(embeddings))
    elif reduce_method == "pca":
        pca_model = PCA(n_components=n_components, random_state=42)
        reduction_values = pca_model.fit_transform(np.array(embeddings))
    else:
        print(f"ERROR: Invalid reduction method ({reduce_method})!")
        # Previously fell through and crashed with a TypeError when
        # indexing None; return None explicitly instead.
        return None
    # Return one column per requested component (the original hard-coded
    # three columns, which broke for n_components != 3).
    return tuple(reduction_values[:, i] for i in range(n_components))