Replace sentence.py

17255091 · 20200203063 · cea76185 · 17255091
Commit 17255091 authored Sep 01, 2020 by 20200203063
Hide whitespace changes
Inline Side-by-side

Showing with 65 additions and 13 deletions

sentence.py
+65 -13

No files found.
--- a/sentence.py
+++ b/sentence.py
-
+# encoding: utf-8
+import nltk
+from nltk.corpus import stopwords
+# English stopword list loaded from the NLTK corpus. NOTE: this rebinds the
+# name `stopwords`, so the object imported from nltk.corpus is no longer
+# reachable under that name after this line.
+stopwords = stopwords.words('english')
+# stopwords = []
+# Chunk grammar handed to nltk.RegexpParser in Sentence.execute: NBAR is a run
+# of noun/adjective tags ending in a noun tag; NP is one NBAR, or two NBARs
+# joined by an <IN>-tagged token.
+grammar = r"""
+ NBAR:
+	{<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
+ NP:
+	{<NBAR>}
+	{<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
+"""
+# Shared WordNet lemmatizer used by Sentence.normalise.
+lemmatizer = nltk.WordNetLemmatizer()
+# Stemming is currently disabled.
+#stemmer = nltk.stem.porter.PorterStemmer()


 class Sentence(object):
 	
-	WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False)
+	# WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False)
 	
-	LEMMATIZER = WordNetLemmatizer()
+	# LEMMATIZER = WordNetLemmatizer()

 	# Extract aspects from each individual sentence
-	ASP_EXTRACTOR = 
+	# ASP_EXTRACTOR = 

-	def __init__(self):
+	def __init__(self, sentence):
+		"""Store the sentence text that the other methods operate on."""
+		self.sentence = sentence
 		

-	def word_tokenize(self):
+	# def word_tokenize(self):
+	# 	return
+
+	# def pos_tag(self):
+	# 	return
+
+	# def lemmatize(self):
+	# 	return
+
+	# def contain_aspect(self):
+	# 	return
 	
+	def extract_noun_phrase(self):
+		"""Return a list of the terms get_terms() produces for this sentence.
+
+		The sentence is chunked with execute() and the resulting tree is
+		handed to get_terms(); everything it yields is collected into a list.
+		"""
+		chunk_tree = self.execute()
+		return list(self.get_terms(chunk_tree))

-	def pos_tag(self):
-		

-	def lemmatize(self):
-		
+	def execute(self):
+		"""Tokenize, POS-tag and chunk self.sentence; return the chunk tree.
+
+		The chunker is an nltk.RegexpParser built from the module-level
+		`grammar` on every call.
+		"""
+		# Taken from Su Nam Kim Paper...
+		np_chunker = nltk.RegexpParser(grammar)
+		tagged_tokens = nltk.tag.pos_tag(nltk.word_tokenize(self.sentence))
+		return np_chunker.parse(tagged_tokens)
+
+	def leaves(self, tree):
+		"""Yield the leaves of every subtree of `tree` whose label is 'NP'."""
+		def is_noun_phrase(node):
+			return node.label() == 'NP'
+		for chunk in tree.subtrees(filter=is_noun_phrase):
+			yield chunk.leaves()
+
+	def normalise(self, word):
+		"""Lowercase `word` and lemmatize it with the module-level lemmatizer."""
+		# Stemming is currently disabled, so no stemmer is applied here.
+		return lemmatizer.lemmatize(word.lower())

-	def contain_aspect(self):
-		
\ No newline at end of file
+	def acceptable_word(self, word):
+		"""Return True if `word` is 2-40 characters long and not a stopword."""
+		if not 2 <= len(word) <= 40:
+			return False
+		# The stopword list is compared against the lowercased word.
+		return word.lower() not in stopwords
+
+	def get_terms(self, tree):
+		"""Yield one normalised term string per NP chunk found in `tree`.
+
+		Words of each chunk that fail acceptable_word() are dropped; the rest
+		are passed through normalise() and joined with single spaces. A chunk
+		whose words are all rejected yields an empty string.
+		"""
+		for leaf in self.leaves(tree):
+			# Yield inside the loop so every chunk produces a term. Yielding
+			# after the loop returned only the last chunk's term and raised
+			# UnboundLocalError when the tree contained no NP chunks.
+			yield " ".join(self.normalise(w) for w, _tag in leaf if self.acceptable_word(w))
+