Source code for interpreter.extractor

from nltk.stem.snowball import SnowballStemmer
import os

# Words that can be interpreted as a direction - for use with the move and turn object extractors.
# Built once at import time so that each lookup is a single constant-time membership test.
_DIRECTIONAL_WORDS = frozenset(["left", "right", "up", "down", "forward", "backward"])


def is_direction(word):
    """
    Checks if a ``word`` represents a direction.

    The word is lower-cased and checked against a hard-coded set of directional words. If there is
    a match, then the ``word`` is determined to be a directional word. This function is present to
    provide a space to make this operation more robust in the future.

    Args:
        word (str): The word to be checked

    Returns:
        bool: ``True`` if the ``word`` represents a directional word, ``False`` if not
    """
    return word.lower() in _DIRECTIONAL_WORDS
def is_noun(tag):
    """
    Tells whether a part of speech tag denotes a noun.

    The tag ``"PRP"`` and every tag that begins with ``"NN"`` are treated as nouns.

    Args:
        tag (str): The part of speech tag to be checked

    Returns:
        bool: ``True`` if the tag represents a noun, ``False`` if not
    """
    # Personal pronouns count as nouns, as well as any type of tagged noun (proper noun, etc.)
    if tag == "PRP":
        return True
    return tag.startswith("NN")
def is_preposition(tag):
    """
    Tells whether a part of speech tag denotes a preposition.

    The tags ``"TO"`` and ``"IN"`` are considered to be prepositions.

    Args:
        tag (str): The part of speech tag to be checked

    Returns:
        bool: ``True`` if the tag represents a preposition, ``False`` if not
    """
    # IN is the general preposition tag; the word "to" carries its own TO tag
    # whenever it is being used as a preposition
    preposition_tags = ("TO", "IN")
    return tag in preposition_tags
def object_dict_follow(sent):
    """
    Builds the object dictionary for a sentence whose :ref:`action <action>` is *follow*.

    Rules:
        1. The first noun encountered is always the *person*
        2. The second noun (if included) is always the *place*

    Objects:
        1. *person* - The person who will be followed
        2. *place* - The location that the person will be followed to

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """
    # Nouns in sentence order; pairing them with the role names assigns the first noun to
    # "person" and the second to "place", and any further nouns are left out.
    nouns = (token[0] for token in sent if is_noun(token[1]))
    return {role: noun.lower() for role, noun in zip(("person", "place"), nouns)}
def object_dict_turn(sent):
    """
    Builds the object dictionary for a sentence whose :ref:`action <action>` is *turn*.

    Turning currently shares its extraction rules with moving, so the work is delegated to
    :meth:`~interpreter.extractor.object_dict_move`.

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """
    turn_objects = object_dict_move(sent)
    return turn_objects
def object_dict_stop(sent):
    """
    Builds the object dictionary for a sentence whose :ref:`action <action>` is *stop*.

    A stop command carries no objects, so ``sent`` is not inspected.

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command - currently an
        empty dictionary under all inputs
    """
    no_objects = dict()
    return no_objects
def object_dict_move(sent):
    """
    Extracts objects out of a sentence that contains *move* as its :ref:`action <action>`

    Rules:
        1. A token that is a directional word (see :meth:`~interpreter.extractor.is_direction`)
           and is tagged ``VBD``, ``NN``, ``IN`` or ``RB`` is the *direction*
        2. Any other noun is the *place*
        3. If several tokens qualify for the same object, the last one in the sentence wins

    Note:
        Prepositions are not currently taken into account when telling a *place* apart from a
        *direction*; the decision rests solely on the directional word list.

    Objects:
        1. *place* - A location to move to
        2. *direction* - A direction to move in

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """
    # The directional words tend to be tagged as one of these four parts of speech
    direction_tags = ("VBD", "NN", "IN", "RB")

    object_dict = {}
    for token in sent:
        word, tag = token[0], token[1]
        if tag in direction_tags and is_direction(word):
            object_dict["direction"] = word.lower()
        elif is_noun(tag):
            object_dict["place"] = word.lower()

    return object_dict
def object_dict_talk(sent):
    """
    Extracts objects out of a sentence that contains *talk* as its :ref:`action <action>`

    Rules:
        1. The noun to come after the word *about* is the *topic*
        2. The noun to come after any preposition that is not *about* is the *person*
        3. Other nouns will be tagged as *unknown*

    Objects:
        1. *person* - The person to talk to
        2. *topic* - The subject to talk about
        3. *unknown* - The role of this noun is not known

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """
    object_dict = {}
    prep_found = False
    about_found = False

    for token in sent:
        word = token[0].lower()
        tag = token[1]

        # "about" is tracked separately from the other prepositions because it introduces the
        # topic rather than the person
        if word == "about":
            about_found = True
        elif is_preposition(tag):
            prep_found = True

        if is_noun(tag):
            if prep_found and not about_found:
                object_dict["person"] = word
                prep_found = False
            elif about_found and not prep_found:
                object_dict["topic"] = word
                about_found = False
            else:
                # Either no marker preceded the noun or both markers are pending, so the
                # role of the noun cannot be decided
                object_dict["unknown"] = word

    return object_dict
def _resolve_shown_word(word, known_words, first_words, stemmer):
    """Returns the canonical form of ``word``, trying the word itself and then its stem; the word
    is returned unchanged when neither is known."""
    index = binary_search_shown_words(word, known_words)
    if index < 0:
        # If the word wasn't found, try looking for its stem
        index = binary_search_shown_words(stemmer.stem(word), known_words)
    if index > -1:
        return first_words[index]
    return word


def object_dict_show(sent):
    """
    Extracts objects out of a sentence that contains *show* as its :ref:`action <action>`

    Rules:
        1. The verb preceded by a *to* is the *show_action*
        2. The noun that is not preceded by a determiner or the *show_action* is the *person*
        3. The noun that is preceded either by a determiner or the *show_action* is the *object*

    After extraction, the *object* and the *show_action* are looked up (first as written, then by
    stem) in the module-level known-word lists and replaced by their canonical form when found.

    Objects:
        1. *show_action* - The action that will be shown in a video
        2. *person* - The person who will be shown the action or object
        3. *object* - The object that is acted on in the video or a static object to be shown as a
           picture
        4. *video_title* - The title of the video to be played; it is generated by concatenating
           the *show_action* with the *object* and is only present when a *show_action* was found

    Args:
        sent (list): A part of speech tagged list of tokens representing a sentence

    Returns:
        dict: An :ref:`object dictionary <object-dictionary>` for the command
    """
    object_dict = {}
    prec_found = False
    to_found = False

    for token in sent:
        word, tag = token[0], token[1]
        if tag == "TO":
            to_found = True
        elif tag == "DT":
            prec_found = True
        elif is_noun(tag):
            if prec_found:
                object_dict["object"] = word.lower()
            else:
                object_dict["person"] = word.lower()
        elif tag == "VB":
            if to_found:
                object_dict["show_action"] = word.lower()
                # Nouns that follow the shown action are objects of that action
                prec_found = True

    # Create the stemmer to get root words if needed
    stemmer = SnowballStemmer("english")

    if "object" in object_dict:
        object_dict["object"] = _resolve_shown_word(
            object_dict["object"], known_shown_objects, first_shown_objects, stemmer)

    if "show_action" in object_dict:
        object_dict["show_action"] = _resolve_shown_word(
            object_dict["show_action"], known_shown_actions, first_shown_actions, stemmer)

        # A video title only makes sense when there is an action to show; building it outside
        # this guard would raise KeyError for sentences such as "show me the robot"
        video_title = object_dict["show_action"]
        if "object" in object_dict:
            video_title = video_title + "-" + object_dict["object"]
        object_dict["video_title"] = video_title.lower()

    return object_dict
def build_shown_words(filename):
    """
    Builds the lookup structures for a file of known words and their synonyms.

    Each non-blank line of the file is a comma-separated set of synonymous words. The first word
    of a line is the canonical word for that set. Lines are lower-cased, and surrounding
    whitespace is removed from the line and from every word. Blank lines are ignored completely
    and do not consume a set index.

    Args:
        filename (str): Path of the known-words file to read

    Returns:
        tuple: ``(known_words, first_words)`` where ``known_words`` is a list of
        ``(word, set_index)`` tuples sorted alphabetically by word, and ``first_words`` is the
        list of canonical words such that ``first_words[set_index]`` is the canonical word of
        the set that ``word`` belongs to
    """
    known_words = []
    first_words = []

    # Text mode keeps the lines as ``str`` on both Python 2 and Python 3, and the ``with`` block
    # guarantees that the file is closed even if reading fails
    with open(filename, "r") as inp_file:
        for line in inp_file:
            line = line.strip().lower()
            if not line:
                continue

            words = [word.strip() for word in line.split(",")]

            # The index of this synonym set is the position its canonical word will occupy
            set_index = len(first_words)
            first_words.append(words[0])
            known_words.extend((word, set_index) for word in words)

    # Sorts the list of known words by A-Z alphabetical order so it can be binary searched
    known_words.sort(key=lambda tup: tup[0])

    return (known_words, first_words)
def binary_search_shown_words(target, pool):
    """
    Binary searches a sorted pool of known words for ``target``.

    Args:
        target (str): The word to look for
        pool (list): ``(word, index)`` tuples sorted in ascending order by word, as produced by
            :meth:`~interpreter.extractor.build_shown_words`

    Returns:
        int: The index stored alongside the matching word, or ``-1`` if ``target`` is not in the
        pool
    """
    low = 0
    high = len(pool)

    # The search window is pool[low:high]; it is narrowed in place rather than by slicing so
    # that no copies of the pool are made
    while low < high:
        # Floor division keeps the midpoint an integer on both Python 2 and Python 3
        mid = low + (high - low) // 2
        word = pool[mid][0]
        if target < word:
            # Target must be in lower half of the window
            high = mid
        elif target > word:
            # Target must be in higher half of the window
            low = mid + 1
        else:
            # Match has been found
            return pool[mid][1]

    # The search window has been exhausted, so the target is not in the pool
    return -1
# Work out where the known-word input files live relative to the current working directory.
# The directory is listed once and the listing is reused for both checks and the print below.
_cwd_entries = os.listdir(".")

if "lili-interpreter" in _cwd_entries:
    # If this is true, code is being run in LSSWinRobot repo
    _input_prefix = "lili-interpreter/"
elif "source" in _cwd_entries:
    # If this is true, code is being run by make in Sphinx documentation generator
    _input_prefix = "source/"
else:
    _input_prefix = ""

shown_actions_path = _input_prefix + "input_files/known_words/known_shown_actions_small.txt"
objects_path = _input_prefix + "input_files/known_words/shown_objects_small.txt"

# NOTE(review): this looks like leftover debugging output that runs on every import; it is kept
# so that behaviour does not change, but written in the call form that parses on both
# Python 2 and Python 3.
print(_cwd_entries)

# Builds synonym lists that are utilized when resolving shown actions and objects to words that
# are already known by LILI
shown_action_res = build_shown_words(shown_actions_path)
shown_object_res = build_shown_words(objects_path)

# The known-word lists must be sorted by word because they are binary searched
known_shown_actions = sorted(shown_action_res[0], key=lambda tup: tup[0])
first_shown_actions = shown_action_res[1]
known_shown_objects = sorted(shown_object_res[0], key=lambda tup: tup[0])
first_shown_objects = shown_object_res[1]