# Source code for semsim.wntest

# This code will be used to test WordNet's ability to match words based on semantic similarity
import csv
import sys

from nltk.corpus import wordnet as wn

def build_known_file(known_words_filename, unknown_words_filename, synset_csv_filename, output_filename, **kwargs):
    """
    Builds an expanded known words file by appending each unknown word to the line of the known word it is most semantically similar to.

    Each non-blank line of the known words file is a comma-separated list of words. The first word of a line identifies that line and is paired, through the synset CSV file, with the WordNet synset used for matching. Every unknown word is scored against each of those synsets: it scores 1 if it is a lemma of the synset, otherwise it scores the highest path similarity between any of its own synsets and the known synset. The unknown word is appended to the line with the highest score (the first such line wins a tie); an unknown word that scores against no line is dropped. All words are lower-cased.

    The output file contains one line per non-blank line of the known words file: the original words followed by the unknown words matched to that line. Lines are separated by a newline and the last line has no trailing newline.

    Args:
        known_words_filename (str): The filename of the text file of known words, one comma-separated group per line
        unknown_words_filename (str): The filename of the text file containing unknown words (one word per line)
        synset_csv_filename (str): The filename of the CSV file (with a header row) pairing the first word of each known line with its WordNet synset name. Known words with no row in this file are kept in the output but never matched against
        output_filename (str): The filename of the file to be written

    Kwargs:
        pos (str): The part of speech of the words to be evaluated. Can have the values "verb", "noun", "adj", or "adv". With any other value, or no value, the synsets of the unknown word are not filtered by part of speech

    Raises:
        KeyError: If the synset CSV file names a word that is not the first word of a line in the known words file
    """
    # csv wants binary files on Python 2 and newline="" text files on Python 3
    py3 = sys.version_info[0] >= 3

    # An unrecognised or missing pos maps to None, which leaves the synset search unfiltered
    pos_tags = {"verb": wn.VERB, "noun": wn.NOUN, "adj": wn.ADJ, "adv": wn.ADV}
    pos_tag = pos_tags.get(str(kwargs.get("pos") or "none").lower())

    first_words = []      # First word of every non-blank line, indexed by line number
    line_words = []       # The remaining words of every line, indexed by line number
    known_line_nums = {}  # First word -> line number, to speed up lookups

    with open(known_words_filename, "r") as known_words_file:
        for line in known_words_file:
            line = line.strip().lower()
            if not line:
                continue
            words = [word.strip() for word in line.split(",")]
            known_line_nums[words[0]] = len(first_words)
            first_words.append(words[0])
            line_words.append(words[1:])

    # Read the synset name assigned to each known first word
    synset_names = {}
    synset_file = open(synset_csv_filename, "r", newline="") if py3 else open(synset_csv_filename, "rb")
    with synset_file:
        synset_file_reader = csv.reader(synset_file)
        # Assuming first line in CSV file is a header, so skip it
        next(synset_file_reader, None)
        for row in synset_file_reader:
            if not row:
                continue
            known = row[0].strip().lower()
            if known not in known_line_nums:
                raise KeyError(known)
            synset_names[known] = row[1].strip()

    # Resolve every known synset once rather than once per unknown word
    known_synsets = [(line_num, wn.synset(synset_names[known]))
                     for known, line_num in known_line_nums.items()
                     if known in synset_names]

    # Map each unknown word to the line of its most similar known word
    with open(unknown_words_filename, "r") as unknown_words_file:
        for line in unknown_words_file:
            unknown_word = line.strip().lower()
            if not unknown_word:
                continue

            unknown_synsets = wn.synsets(unknown_word, pos_tag)
            max_sem_sim_score = -1  # If words are not semantically similar, this value will not be changed
            match_line_num = None

            for known_line_num, known_synset in known_synsets:
                if unknown_word in known_synset.lemma_names():
                    # A lemma of the known synset is a synonym of the known word
                    scores = [1]
                else:
                    scores = [unknown_synset.path_similarity(known_synset) for unknown_synset in unknown_synsets]

                for sem_sim_score in scores:
                    # path_similarity returns None when the two synsets are not connected
                    if sem_sim_score is not None and sem_sim_score > max_sem_sim_score:
                        max_sem_sim_score = sem_sim_score
                        match_line_num = known_line_num

            if match_line_num is not None:
                line_words[match_line_num].append(unknown_word)

    # Write every known line, including lines that gained no new words
    out_file = open(output_filename, "w", newline="") if py3 else open(output_filename, "wb")
    with out_file:
        out_file.write("\n".join(",".join([first_word] + rest)
                                 for first_word, rest in zip(first_words, line_words)))


def sem_sim_test2(known_words_filename, unknown_words_filename, **kwargs):
    """
    Tests semantic similarity mapping from unknown words to known words. Synsets of the unknown words are not known in advance and those of the known words are determined in advance.

    Accepts a CSV file of known words paired with their assumed WordNet synset and a text file of unknown words (with one word per line). Each unknown word is matched up with the known word that it is most semantically similar to. "Known" words are words that LILI has been preprogrammed to recognize or respond to in some way, while "unknown" words are those that LILI does not understand by default. Semantic similarity measures are made using WordNet and attempt to allow LILI to understand an open vocabulary beyond the words and phrases it has been preprogrammed to respond to. This test returns a list of :class:`~semsim.wntest.SemanticSimilarityResult` objects to store the results of the test for each unknown word.

    An unknown word that is a lemma of a known word's synset scores 1 against that known word, and that synset is reported as the unknown word's synset too. Otherwise the score is the highest path similarity between any synset of the unknown word and the known synset. When several known words share the highest score, the one listed first in the CSV file is chosen. Blank lines in the unknown words file are skipped. A progress message is printed for every unknown word processed.

    Args:
        known_words_filename (str): The filename of the CSV file (with a header row) containing known words paired with their assumed synsets
        unknown_words_filename (str): The filename of the text file containing unknown words

    Kwargs:
        pos (str): The part of speech of the words to be evaluated. Can have the values "verb", "noun", "adj", or "adv". With any other value, or no value, the synsets of the unknown word are not filtered by part of speech, resulting in more processing time and potentially less accurate results

    Returns:
        list: One :class:`~wntest.SemanticSimilarityResult` per unknown word, in the order the words appear in the file. Words without a match have a known word of "No match found", "N/A" for the synset fields and a score of -1
    """
    # csv wants binary files on Python 2 and newline="" text files on Python 3
    py3 = sys.version_info[0] >= 3

    # An unrecognised or missing pos maps to None, which leaves the synset search unfiltered
    pos_tags = {"verb": wn.VERB, "noun": wn.NOUN, "adj": wn.ADJ, "adv": wn.ADV}
    pos_tag = pos_tags.get(str(kwargs.get("pos") or "none").lower())

    # Read the known words and resolve each synset once, not once per unknown word
    known_words = []
    known_words_file = open(known_words_filename, "r", newline="") if py3 else open(known_words_filename, "rb")
    with known_words_file:
        known_word_reader = csv.reader(known_words_file)
        # Assuming first line in CSV file is a header
        next(known_word_reader, None)
        for row in known_word_reader:
            if not row:
                continue
            known_words.append((row[0].lower(), wn.synset(row[1])))

    results = []

    with open(unknown_words_filename, "r") as unknown_words_file:
        for line in unknown_words_file:
            unknown = line.lower().strip()
            if not unknown:
                continue

            unknown_synsets = wn.synsets(unknown, pos_tag)

            max_sem_sim_score = -1  # If words are not semantically similar, this value will not be changed
            max_unknown_synset = None
            max_known_synset = None
            known_choice = "No match found"

            for known, known_synset in known_words:
                if unknown in known_synset.lemma_names():
                    # The unknown word belongs to the known synset itself, so that
                    # synset is the best sense of the unknown word as well
                    candidates = [(known_synset, 1)]
                else:
                    candidates = [(unknown_synset, unknown_synset.path_similarity(known_synset))
                                  for unknown_synset in unknown_synsets]

                for unknown_synset, sem_sim_score in candidates:
                    # path_similarity returns None when the two synsets are not connected
                    if sem_sim_score is not None and sem_sim_score > max_sem_sim_score:
                        max_sem_sim_score = sem_sim_score
                        max_unknown_synset = unknown_synset
                        max_known_synset = known_synset
                        known_choice = known

            if max_known_synset is not None:
                results.append(SemanticSimilarityResult(unknown, known_choice, max_unknown_synset.name(), max_known_synset.name(), max_unknown_synset.definition(), max_known_synset.definition(), max_sem_sim_score))
            else:
                results.append(SemanticSimilarityResult(unknown, "No match found", "N/A", "N/A", "N/A", "N/A", max_sem_sim_score))

            print("Finished processing " + unknown)

    return results

def process_results(results_list):
    """
    Orders :class:`~wntest.SemanticSimilarityResult` objects from the highest semantic similarity score to the lowest

    The given list is left untouched; a new list is built and returned. Results with equal scores keep their relative order.

    Args:
        results_list (list): The :class:`~wntest.SemanticSimilarityResult` objects to be ordered

    Returns:
        list: A new list holding the same objects in descending order of ``sem_sim_score``
    """
    def score_of(result):
        return result.sem_sim_score

    ranked = list(results_list)
    ranked.sort(key=score_of, reverse=True)
    return ranked

def output_results(results_list, output_filename):
    """
    Prints the given list of :class:`~wntest.SemanticSimilarityResult` objects to a CSV file

    Given a list of :class:`~wntest.SemanticSimilarityResult` objects and a .csv filename, writes a header row followed by one row per result object to the specified file. An existing file of that name is overwritten. The file is closed before the function returns, so all rows are on disk by then.

    Args:
        results_list (list): The list of :class:`~wntest.SemanticSimilarityResult` objects to be printed to the CSV file
        output_filename (str): The name of the CSV file to be written to
    """
    # csv wants a binary file on Python 2 and a newline="" text file on Python 3
    if sys.version_info[0] >= 3:
        out_file = open(output_filename, "w", newline="")
    else:
        out_file = open(output_filename, "wb")

    with out_file:
        out_writer = csv.writer(out_file)

        # Writes a header to the output file
        out_writer.writerow(("unknown", "known", "unknown_synset", "known_synset", "unknown_definition", "known_definition", "semantic_similarity_score"))

        for res in results_list:
            out_writer.writerow((res.unknown, res.known, res.unknown_synset, res.known_synset, res.unknown_definition, res.known_definition, res.sem_sim_score))

def filter_results(results_list, threshold):
    """
    Keeps only the results whose semantic similarity score reaches the given threshold

    Walks the given :class:`~SemanticSimilarityResult` objects in order and collects those whose ``sem_sim_score`` is greater than or equal to the threshold. The given list itself is not modified.

    Args:
        results_list (list): The :class:`~SemanticSimilarityResult` objects to be filtered
        threshold (number): The lowest semantic similarity score a result may have and still be kept

    Returns:
        list: A new list of the :class:`~SemanticSimilarityResult` objects that passed, in their original order
    """
    kept = []
    for result in results_list:
        if result.sem_sim_score >= threshold:
            kept.append(result)
    return kept


class SemanticSimilarityResult(object):
    """ This class is used to package results from semantic similarity tests. Each object of this class holds the result of a single unknown to known word mapping.

    This class stores a variety of information for analysis purposes, including the synsets with the highest similarity score, their definitions, the semantic similarity score, and most importantly the known word that the unknown word will map to. This class is to be used for testing and analysis purposes to see how the semantic similarity measure may be improved. The only piece of information important to the final result of the LILI interpreter is the known word that the unknown word is mapped to.

    The constructor stores its arguments unchanged. The callers in this module pass synset names and definitions as strings, with "N/A" placeholders when no match was found.

    Attributes:
        unknown (str): The unknown word that has been mapped to a known word
        known (str): The known word that has been mapped to
        unknown_synset (str): The name of the synset of the unknown word
        known_synset (str): The name of the synset of the known word that has been mapped to
        unknown_definition (str): The definition of unknown_synset
        known_definition (str): The definition of known_synset
        sem_sim_score (number): The semantic similarity score between unknown_synset and known_synset
    """
    def __init__(self, unknown, known, unknown_synset, known_synset, unknown_definition, known_definition, sem_sim_score):
        """ Constructor for the :class:`wntest.SemanticSimilarityResult` class. See the class's documentation for details on each parameter
        """
        self.unknown = unknown
        self.known = known
        self.unknown_synset = unknown_synset
        self.known_synset = known_synset
        self.unknown_definition = unknown_definition
        self.known_definition = known_definition
        self.sem_sim_score = sem_sim_score

    def __repr__(self):
        """ Returns a short debugging representation showing the word mapping and its score
        """
        return "%s(unknown=%r, known=%r, sem_sim_score=%r)" % (
            type(self).__name__, self.unknown, self.known, self.sem_sim_score)