Commit 17255091 by 20200203063

Replace sentence.py

parent cea76185
# encoding: utf-8
import nltk
from nltk.corpus import stopwords
# English stop-word list, consulted by Sentence.acceptable_word.
# NOTE: this rebinds the name `stopwords`, so from here on it refers to the
# word list rather than to the imported nltk.corpus module.
stopwords = stopwords.words('english')
# Disabled alternative: an empty list would switch stop-word filtering off.
# stopwords = []
# Chunk grammar handed to nltk.RegexpParser in Sentence.execute. It defines
# two chunk labels, NBAR and NP; Sentence.leaves only collects NP subtrees.
grammar = r"""
NBAR:
{<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
NP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
# Shared WordNet lemmatizer instance, used by Sentence.normalise.
lemmatizer = nltk.WordNetLemmatizer()
# Stemming is currently disabled (see the matching commented-out call in
# Sentence.normalise).
#stemmer = nltk.stem.porter.PorterStemmer()
class Sentence(object):
    """A single sentence from which noun phrases can be extracted.

    The sentence is tokenized and POS-tagged with NLTK, chunked with the
    module-level ``grammar``, and every ``NP`` chunk is turned into a
    normalised, space-joined term.
    """

    # Planned but not implemented yet (kept disabled on purpose):
    #   WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False)
    #   LEMMATIZER = WordNetLemmatizer()
    #   ASP_EXTRACTOR = ...   (extract aspects for each sentence)
    # and the methods word_tokenize, pos_tag, lemmatize and contain_aspect.

    def __init__(self, sentence):
        """Store the raw sentence.

        :param sentence: the text later passed to ``nltk.word_tokenize``.
        """
        self.sentence = sentence

    def extract_noun_phrase(self):
        """Return the list of noun-phrase terms found in the sentence."""
        return list(self.get_terms(self.execute()))

    def execute(self):
        """Tokenize, POS-tag and chunk the sentence.

        :return: the chunk tree produced by ``nltk.RegexpParser(grammar)``.
        """
        # Chunking approach taken from the Su Nam Kim paper.
        chunker = nltk.RegexpParser(grammar)
        tokens = nltk.word_tokenize(self.sentence)
        tagged_tokens = nltk.tag.pos_tag(tokens)
        return chunker.parse(tagged_tokens)

    def leaves(self, tree):
        """Yield the leaves of every subtree of ``tree`` labelled ``'NP'``."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def normalise(self, word):
        """Return ``word`` lowercased and lemmatized.

        Stemming is currently disabled.
        """
        return lemmatizer.lemmatize(word.lower())

    def acceptable_word(self, word):
        """Return True if ``word`` is 2 to 40 characters long and its
        lowercase form is not in ``stopwords``."""
        # The length check comes first so out-of-range words are rejected
        # without consulting the stop-word list.
        return 2 <= len(word) <= 40 and word.lower() not in stopwords

    def get_terms(self, tree):
        """Yield one space-joined, normalised term per ``NP`` chunk of ``tree``.

        Words rejected by ``acceptable_word`` are left out of the term.
        """
        # NOTE(review): if every word of a chunk is rejected, the yielded
        # term is the empty string; callers may want to skip those.
        for leaf in self.leaves(tree):
            yield " ".join(
                self.normalise(word)
                for word, _tag in leaf
                if self.acceptable_word(word)
            )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment