" \"\"\"Finds NP (nounphrase) leaf nodes of a chunk tree.\"\"\"\n",
" for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):\n",
" yield subtree.leaves()\n",
"\n",
"    def normalise(self, word):\n",
"        \"\"\"Return ``word`` lowercased, then stemmed, then lemmatized.\n",
"\n",
"        Uses the ``stemmer`` and ``lemmatizer`` names, which are resolved\n",
"        from outside this method.\n",
"        \"\"\"\n",
"        lowered = word.lower()\n",
"        stemmed = stemmer.stem(lowered)\n",
"        return lemmatizer.lemmatize(stemmed)\n",
"\n",
"    def acceptable_word(self, word):\n",
"        \"\"\"Return True if ``word`` has 2 to 40 characters and is not a stopword.\n",
"\n",
"        The stopword lookup is done on the lowercased word against the\n",
"        ``stopwords`` collection resolved from outside this method.\n",
"        \"\"\"\n",
"        if not 2 <= len(word) <= 40:\n",
"            return False\n",
"        return word.lower() not in stopwords\n",
"\n",
"    def get_terms(self, tree):\n",
"        \"\"\"Yield one list of normalised words per item from ``self.leaves(tree)``.\n",
"\n",
"        Each item is iterated as pairs; only the first element of a pair is\n",
"        used, and it is kept only when ``self.acceptable_word`` accepts it.\n",
"        \"\"\"\n",
"        for pairs in self.leaves(tree):\n",
"            yield [\n",
"                self.normalise(token)\n",
"                for token, _ in pairs\n",
"                if self.acceptable_word(token)\n",
"            ]\n",
"\n",
"    def extract(self):\n",
"        \"\"\"Return a flat list of all words from the terms of ``self.execute()``.\n",
"\n",
"        The result of ``self.execute()`` is passed to ``self.get_terms`` and\n",
"        the yielded word lists are concatenated in order.\n",
"        \"\"\"\n",
"        return [\n",
"            word\n",
"            for term in self.get_terms(self.execute())\n",
"            for word in term\n",
"        ]\n"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bimorph', 'forc', 'sensor']"
]
},
"execution_count": 187,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# example\n",
"document = 'A novel device was designed to measure drainage dynamics of thin liquid films confined between a solid particle, an immiscible liquid droplet, and/or gas bubble. Equipped with a bimorph force sensor'\n",