import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


class Sentence(object):

    def clean_symbols(self, reviews, replace=' '):
        """Replace punctuation symbols and digits in each review with `replace`."""
        symbols = re.compile(r'[\s;",.!?$/\[\]{}()-]+')
        digits = re.compile(r'\d+')
        cleaned = []
        for review in reviews:
            review = symbols.sub(replace, review)
            review = digits.sub(replace, review)
            cleaned.append(review)
        return cleaned

    def get_wordnet_pos(self, word):
        """Map the Penn Treebank POS tag of a word to the WordNet tag set."""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        return None

    def extract_aspect(self, clean_words, sent):
        """Keep only the nouns in each cleaned review and lemmatize them,
        for extracting the top aspects by frequency."""
        aspect = []
        for review in clean_words:
            # lower-case and tokenize
            tokens = word_tokenize(review.lower())
            # remove stop words
            no_stopwords = [t.strip() for t in tokens if t not in stop_words]
            # retain nouns longer than 3 characters and lemmatize them
            word_lem = [
                lemmatizer.lemmatize(w, sent.get_wordnet_pos(w))
                for w in no_stopwords
                if sent.get_wordnet_pos(w) == wordnet.NOUN and len(w) > 3
            ]
            aspect.append(word_lem)
        return aspect

    def words_cnt(self, clean_words):
        """Count how often each word occurs across all reviews."""
        count = 0  # running total of tokens (not returned)
        vocab_count = dict()
        for words in clean_words:
            count += len(words)
            for w in words:
                if w in vocab_count:
                    vocab_count[w] += 1
                else:
                    vocab_count[w] = 1
        return vocab_count
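

# --- Usage sketch (illustrative; not part of the original module) ---
# Assumes the required NLTK data packages ('punkt', 'averaged_perceptron_tagger',
# 'wordnet', 'omw-1.4', 'stopwords') have already been downloaded with
# nltk.download(), and that `reviews` is a hypothetical list of raw review strings.
if __name__ == '__main__':
    reviews = [
        "The battery life is great, but the screen scratches easily!",
        "Delivery took 3 days; the packaging was torn.",
    ]
    sent = Sentence()
    cleaned = sent.clean_symbols(reviews)         # strip punctuation and digits
    aspects = sent.extract_aspect(cleaned, sent)  # lemmatized nouns per review
    counts = sent.words_cnt(aspects)              # aspect frequency across reviews
    print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True))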