# Source code for mip_dmp.process.embedding

# Copyright 2023 The HIP team, University Hospital of Lausanne (CHUV), Switzerland & Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module that provides function to handle word embeddings and operations on them."""

# External imports
import numpy as np
from scipy import spatial
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Internal imports
from mip_dmp.utils.io import load_glove_model, load_c2v_model


def glove_embedding(text, glove_model):
    """Compute a GloVe embedding for a text by summing per-character vectors.

    The text is lowercased, underscores are discarded, and every remaining
    character is looked up individually in ``glove_model``. The vectors
    found are added together element-wise.

    Parameters
    ----------
    text : str
        Text to be embedded.

    glove_model : mapping-like
        Object that returns a vector when indexed with a single-character
        string (``glove_model[char]``).
        NOTE(review): presumably the model returned by
        ``load_glove_model()`` -- confirm against the caller.

    Returns
    -------
    numpy.ndarray
        Element-wise sum of the vectors of the retained characters. When
        no character is retained (empty text or underscores only), the sum
        over the empty array, i.e. the scalar ``0.0``, is returned.

    Notes
    -----
    A character that ``glove_model`` cannot resolve is not handled here;
    whatever the lookup raises propagates to the caller.
    """
    # The lookup granularity is the character, not the word: the lowercased
    # text is split into its characters and "_" separators are ignored.
    characters = [char for char in text.lower() if char != "_"]

    # Stack one vector per character, then collapse the stack by summation.
    char_vectors = np.array([glove_model[char] for char in characters])
    return np.sum(char_vectors, axis=0)
def chars2vec_embedding(text, chars2vec_model):
    """Compute the chars2vec embedding of a single text.

    Parameters
    ----------
    text : str
        Text to be embedded.

    chars2vec_model : object
        Model exposing a ``vectorize_words(list_of_strings)`` method.
        NOTE(review): presumably the model returned by
        ``load_c2v_model()`` -- confirm against the caller.

    Returns
    -------
    numpy.ndarray
        First entry of what ``vectorize_words`` returns for the
        one-element batch ``[text]``.
    """
    # vectorize_words operates on a batch of strings, so the single text is
    # wrapped in a list and the only resulting embedding is unwrapped again.
    batch_embeddings = chars2vec_model.vectorize_words([text])
    return batch_embeddings[0]
def embedding_similarity(x_embedding, y_embedding):
    """Compare two embeddings using the cosine distance.

    NOTE(review): despite its name, this function returns the value of
    ``scipy.spatial.distance.cosine``, which is the cosine *distance*
    (``1 - cosine similarity``): smaller values mean closer embeddings.
    The return value is kept as is because callers may rank on it.

    Parameters
    ----------
    x_embedding : array-like
        Embedding vector to compare against.

    y_embedding : array-like
        Embedding vector to compare.

    Returns
    -------
    float
        Cosine distance between the two embedding vectors.
    """
    cosine_distance = spatial.distance.cosine
    return cosine_distance(x_embedding, y_embedding)
def generate_embeddings(words: list, embedding_method: str = "chars2vec"):
    """Generate embeddings for a list of words.

    Parameters
    ----------
    words : list
        List of words to generate embeddings for.

    embedding_method : str
        Embedding method to be used, either "chars2vec" or "glove".

    Returns
    -------
    list or None
        List with one embedding per word, in the order of ``words``.
        ``None`` is returned (after an error message is printed) when
        ``embedding_method`` is not one of the supported methods.
    """
    print(f"> Generating embeddings for {len(words)} words...")

    # The model is loaded once per call and shared by all words.
    if embedding_method == "chars2vec":
        c2v_model = load_c2v_model()
        return [chars2vec_embedding(word, c2v_model) for word in words]

    if embedding_method == "glove":
        glove_model = load_glove_model()
        return [glove_embedding(word, glove_model) for word in words]

    # Unknown method: keep the historical ``None`` return value, but report
    # the problem instead of failing silently (same message style as
    # reduce_embeddings_dimension()).
    print(f"ERROR: Invalid embedding method ({embedding_method})!")
    return None
def find_n_closest_embeddings(
    word_embedding: np.ndarray, embeddings: list, embedding_words: list, n: int = 5
):
    """Find the n closest embeddings to the given embedding.

    Closeness is measured with the cosine distance
    (``scipy.spatial.distance.cosine``); the smaller the distance, the
    closer the embedding.

    Parameters
    ----------
    word_embedding : numpy.ndarray
        Embedding to find the n closest embeddings to.

    embeddings : list
        List of candidate embeddings to search in.

    embedding_words : list
        List of words corresponding to ``embeddings`` (same order). It is
        re-ordered and truncated the same way as the embeddings.

    n : int
        Number of closest embeddings to find. If fewer than ``n``
        candidates are available, all of them are returned.

    Returns
    -------
    dict
        Dictionary containing the n closest embeddings, their distances to
        the given embedding (as ``numpy.float32``), and the corresponding
        words, all sorted by increasing distance, in the form::

            {
                "distances": [float],
                "embeddings": [numpy.ndarray],
                "embedding_words": [str]
            }
    """
    distances = np.array(
        [spatial.distance.cosine(word_embedding, embedding) for embedding in embeddings]
    ).astype(np.float32)

    # Indices of the n smallest distances, closest first. The slice is taken
    # once and reused for the three parallel output lists.
    closest_indices = np.argsort(distances)[0:n]

    return {
        "distances": [distances[i] for i in closest_indices],
        "embeddings": [embeddings[i] for i in closest_indices],
        "embedding_words": [embedding_words[i] for i in closest_indices],
    }
def reduce_embeddings_dimension(
    embeddings: list, reduce_method: str = "tsne", n_components: int = 3
):
    """Reduce the dimension of the embeddings, mainly for visualization purposes.

    Parameters
    ----------
    embeddings : list
        List of embeddings to reduce the dimension of.

    reduce_method : str
        Method to use to reduce the dimension, either "tsne" or "pca".

    n_components : int
        Number of components to reduce the dimension to.

    Returns
    -------
    tuple of numpy.ndarray or None
        One 1-D array per reduced component (``n_components`` arrays, i.e.
        ``(x, y, z)`` for the default of 3), each holding the value of that
        component for every embedding. ``None`` is returned (after an
        error message is printed) when ``reduce_method`` is invalid.
    """
    print(
        f"> Reducing embeddings dimensionality to {n_components} using {reduce_method}..."
    )

    if reduce_method == "tsne":
        # scikit-learn renamed TSNE's ``n_iter`` argument to ``max_iter``;
        # pick whichever keyword the installed version accepts.
        import inspect

        tsne_parameters = inspect.signature(TSNE).parameters
        iterations_keyword = "max_iter" if "max_iter" in tsne_parameters else "n_iter"
        reducer = TSNE(
            perplexity=40,
            n_components=n_components,
            init="pca",
            random_state=42,
            **{iterations_keyword: 2500},
        )
    elif reduce_method == "pca":
        reducer = PCA(n_components=n_components, random_state=42)
    else:
        # Previously execution fell through and crashed while indexing None.
        print(f"ERROR: Invalid reduction method ({reduce_method})!")
        return None

    reduction_values = reducer.fit_transform(np.array(embeddings))

    # Return every reduced component as its own array instead of assuming
    # exactly three columns, so that n_components other than 3 also works.
    return tuple(
        reduction_values[:, component]
        for component in range(reduction_values.shape[1])
    )