Source code for interpreter.extractor

from nltk.stem.snowball import SnowballStemmer
import os

def is_direction(word):
    """
    Determines whether ``word`` names a direction.

    The lower-cased ``word`` is compared against a fixed collection of directional words; an exact match with any of them makes it a directional word. Keeping the test in its own function leaves room to make the check more robust later.

    Args:
        word (str): The word to be checked

    Returns:
        bool: ``True`` if the ``word`` is one of the known directional words, ``False`` otherwise
    """
    # Directional vocabulary used by the move and turn object extractors
    known_directions = ("left", "right", "up", "down", "forward", "backward")
    return word.lower() in known_directions



def is_noun(tag):
    """
    Decides whether a part of speech tag stands for a noun. The tag ``"PRP"`` and every tag that begins with ``"NN"`` count as nouns.

    Args:
        tag (str): The part of speech tag to be checked

    Returns:
        bool: ``True`` if the tag represents a noun, ``False`` if not
    """
    # Personal pronouns are treated as nouns, alongside every kind of tagged noun (proper noun, plural, etc.)
    if tag == "PRP":
        return True
    return tag.startswith("NN")

def is_preposition(tag):
    """
    Decides whether a part of speech tag stands for a preposition. Only the tags ``"TO"`` and ``"IN"`` are considered to be prepositions.

    Args:
        tag (str): The part of speech tag to be checked

    Returns:
        bool: ``True`` if the tag represents a preposition, ``False`` if not
    """
    # IN is the general preposition tag, while the word "to" carries its own TO tag when used as a preposition
    preposition_tags = ("TO", "IN")
    return tag in preposition_tags

def object_dict_follow(sent):
    """
    Extracts objects out of a sentence that contains *follow* as its :ref:`action <action>`

    Rules:

    1. The first noun encountered is always the *person*
    2. The second noun (if included) is always the *place*
    3. Any further nouns are ignored

    Objects:

    1. *person* - The person who will be followed
    2. *place* - The location that the person will be followed to

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """
    # Roles still waiting for a noun, in the order in which nouns fill them
    open_roles = ["person", "place"]
    object_dict = {}

    for token in sent:
        if not is_noun(token[1]):
            continue
        # Once both roles have been filled, later nouns are skipped
        if open_roles:
            object_dict[open_roles.pop(0)] = token[0].lower()

    return object_dict

def object_dict_turn(sent):
    """
    Extracts objects out of a sentence that contains *turn* as its :ref:`action <action>`.

    No rules specific to turning exist yet; the sentence is handed straight to :meth:`~interpreter.extractor.object_dict_move` and its result is returned unchanged.

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """
    # Delegate entirely to the move extractor
    turn_objects = object_dict_move(sent)
    return turn_objects

def object_dict_stop(sent):
    """
    Extracts objects out of a sentence that contains *stop* as its :ref:`action <action>`

    A stop command currently has no objects, so ``sent`` is not inspected.

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command - currently an empty dictionary for every input
    """
    # A new dictionary is created on each call
    return dict()

def object_dict_move(sent):
    """
    Extracts objects out of a sentence that contains *move* as its :ref:`action <action>`

    Rules:

    1. A token tagged ``VBD``, ``NN``, ``IN`` or ``RB`` whose word passes :meth:`~interpreter.extractor.is_direction` is the *direction*
    2. Any other token whose tag passes :meth:`~interpreter.extractor.is_noun` is the *place*
    3. When several tokens qualify for the same object, the last one in the sentence is kept
    4. Prepositions are not consulted when telling a *place* from a *direction*

    Objects:

    1. *place* - A location to move to
    2. *direction* - A direction to move in

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """
    # The directional words tend to be tagged as one of these four parts of speech
    direction_tags = ("VBD", "NN", "IN", "RB")

    object_dict = {}
    for token in sent:
        # The direction test runs first, so a directional word tagged NN is never treated as a place
        if token[1] in direction_tags and is_direction(token[0]):
            object_dict["direction"] = token[0].lower()
        elif is_noun(token[1]):
            object_dict["place"] = token[0].lower()

    return object_dict

def object_dict_talk(sent):
    """
    Extracts objects out of a sentence that contains *talk* as its :ref:`action <action>`

    Rules:

    1. The noun to come after the word *about* is the *topic*
    2. The noun to come after any preposition that is not *about* is the *person*
    3. Other nouns will be tagged as *unknown*; this includes a noun reached while both *about* and another preposition are still waiting for a noun
    4. Each object holds only the most recent noun assigned to it

    Objects:

    1. *person* - The person to talk to
    2. *topic* - The subject to talk about
    3. *unknown* - The role of this noun is not known

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """

    object_dict = {}
    # Each flag stays set until a noun consumes it
    prep_found = False
    about_found = False

    for token in sent:
        word = token[0].lower()

        # "about" is recognised by its text whatever its tag; other prepositions are recognised by tag
        if word == "about":
            about_found = True
        elif is_preposition(token[1]):
            prep_found = True

        if is_noun(token[1]):
            if prep_found and not about_found:
                object_dict["person"] = word
                prep_found = False
            elif about_found and not prep_found:
                object_dict["topic"] = word
                about_found = False
            else:
                # Neither flag or both flags are set, so the noun's role cannot be decided
                object_dict["unknown"] = word

    return object_dict

def _resolve_shown_word(word, known_words, first_words, stemmer):
    """
    Maps ``word`` to the first word of its synonym group, trying the word itself and then its stem.

    Args:
        word (str): The word to resolve
        known_words (list): Sorted ``(word, group_index)`` tuples to search
        first_words (list): The first word of each synonym group, indexed by group index
        stemmer: Object whose ``stem`` method returns the root form of a word

    Returns:
        str: The first word of the matching group, or ``word`` unchanged if neither the word nor its stem is known
    """
    group_index = binary_search_shown_words(word, known_words)
    if group_index < 0:
        # If the word itself wasn't found, try looking for its stem
        group_index = binary_search_shown_words(stemmer.stem(word), known_words)

    if group_index > -1:
        return first_words[group_index]
    return word


def object_dict_show(sent):
    """
    Extracts objects out of a sentence that contains *show* as its :ref:`action <action>`

    Rules:

    1. A ``VB`` token that comes after a ``TO`` token is the *show_action*
    2. A noun that comes before any determiner (``DT``) and before the *show_action* is the *person*
    3. A noun that comes after a determiner or after the *show_action* is the *object*
    4. When several tokens qualify for the same object, the last one in the sentence is kept

    After extraction, the *object* and the *show_action* are each replaced by the first word of their synonym group when the word (or, failing that, its stem) appears in the known shown objects / known shown actions.

    Objects:

    1. *show_action* - The action that will be shown in a video
    2. *person* - The person who will be shown the action or object
    3. *object* - The object that is acted on in the video or a static object to be shown as a picture
    4. *video_title* - The title of the video to be played; only present when there is a *show_action*, and built by joining the *show_action* and the *object* (if any) with a hyphen

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """

    object_dict = {}
    # Set once a determiner or the show action has been seen; later nouns are then objects
    prec_found = False
    to_found = False

    for token in sent:
        if token[1] == "TO":
            to_found = True
        elif token[1] == "DT":
            prec_found = True
        elif is_noun(token[1]):
            if prec_found:
                object_dict["object"] = token[0].lower()
            else:
                object_dict["person"] = token[0].lower()
        elif token[1] == "VB":
            if to_found:
                object_dict["show_action"] = token[0].lower()
                prec_found = True

    # Create the stemmer to get root words if needed
    stemmer = SnowballStemmer("english")

    if "object" in object_dict:
        object_dict["object"] = _resolve_shown_word(
            object_dict["object"], known_shown_objects, first_shown_objects, stemmer)

    if "show_action" in object_dict:
        object_dict["show_action"] = _resolve_shown_word(
            object_dict["show_action"], known_shown_actions, first_shown_actions, stemmer)

        # The title is built from the resolved words
        video_title = object_dict["show_action"]
        if "object" in object_dict:
            video_title = video_title + "-" + object_dict["object"]
        object_dict["video_title"] = video_title.lower()

    return object_dict

def build_shown_words(filename):
    """
    Builds the synonym lookup structures from a file of known words.

    Every non-blank line of the file is read as a comma-separated group of synonyms. Each line is lower-cased and every word has its surrounding whitespace removed. Groups are numbered from zero in the order in which their lines appear; blank lines are ignored and do not consume a number.

    Args:
        filename (str): Path of the file to read

    Returns:
        tuple: ``(known_words, first_words)``. ``known_words`` is a list of ``(word, group_index)`` tuples, one per word in the file, sorted A-Z by word. ``first_words`` is a list holding the first word of each group, so ``first_words[group_index]`` is the word that a group's synonyms resolve to.
    """
    known_words = []
    first_words = []

    # Text mode yields str lines on both Python 2 and 3, and the with block closes the file even on error
    with open(filename, "r") as inp_file:
        for line in inp_file:
            line = line.strip().lower()
            # Blank lines are ignored completely
            if not line:
                continue

            # Strip every entry, including the first, so the resolved word never carries stray whitespace
            words = [word.strip() for word in line.split(",")]

            # The next free slot in first_words is this group's index
            group_index = len(first_words)
            first_words.append(words[0])
            known_words.extend((word, group_index) for word in words)

    # Sorted A-Z by word so that the list can be binary searched
    known_words.sort(key=lambda tup: tup[0])

    return (known_words, first_words)

def binary_search_shown_words(target, pool):
    """
    Binary searches a sorted list of ``(word, index)`` tuples for ``target``.

    Args:
        target (str): The word to look for
        pool (list): ``(word, index)`` tuples sorted in ascending order by word

    Returns:
        int: The second element of the tuple whose word equals ``target``, or ``-1`` if no tuple matches
    """
    # Search window is pool[low:high]; narrowing the bounds avoids copying the list at every step
    low = 0
    high = len(pool)

    while low < high:
        # Floor division keeps the midpoint an integer on both Python 2 and Python 3
        mid = low + (high - low) // 2
        candidate = pool[mid][0]

        if target < candidate: # Target must be in lower half of the window
            high = mid
        elif target > candidate: # Target must be in higher half of the window
            low = mid + 1
        else: # Match has been found
            return pool[mid][1]

    # The window has been exhausted, so the target is not in the pool
    return -1

# Works out where the known-word input files live relative to the current working directory
shown_actions_path = ""
objects_path = ""
if "lili-interpreter" in os.listdir("."): # If this is true, code is being run in LSSWinRobot repo
    shown_actions_path = "lili-interpreter/"
    objects_path = "lili-interpreter/"
elif "source" in os.listdir("."): # If this is true, code is being run by make in Sphinx documentation generator
    shown_actions_path = "source/"
    objects_path = "source/"

shown_actions_path = shown_actions_path + "input_files/known_words/known_shown_actions_small.txt"
objects_path = objects_path + "input_files/known_words/shown_objects_small.txt"

# Single-argument print(...) prints the same thing on Python 2 and 3; the bare print statement is a syntax error on Python 3
print(os.listdir("."))


# Builds synonym lists that are utilized when resolving shown actions and objects to words that are already known by LILI
shown_action_res = build_shown_words(shown_actions_path)
shown_object_res = build_shown_words(objects_path)

# The (word, index) lists are kept sorted by word because binary_search_shown_words relies on that ordering
known_shown_actions = sorted(shown_action_res[0], key=lambda tup: tup[0])
first_shown_actions = shown_action_res[1]

known_shown_objects = sorted(shown_object_res[0], key=lambda tup: tup[0])
first_shown_objects = shown_object_res[1]