import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


class Sentence(object):

    def clean_symbols(self, reviews, replace=' '):
        """Replace punctuation symbols and digits in each review with `replace`."""
        symbols = re.compile(r'[\s;",.!?$/\[\]{}()-]+')
        digits = re.compile(r'\d+')
        cleaned = []
        for review in reviews:
            review = symbols.sub(replace, review)
            review = digits.sub(replace, review)
            cleaned.append(review)
        return cleaned

    def get_wordnet_pos(self, word):
        """Map the Penn Treebank POS tag of a word to the WordNet tag set."""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        return None

    def extract_aspect(self, clean_words, sent):
        """Keep only the nouns in each cleaned review and lemmatize them,
        for extracting the top aspects by frequency."""
        aspect = []
        for review in clean_words:
            # lower-case and tokenize
            tokens = word_tokenize(review.lower())
            # remove stop words
            no_stopwords = [t.strip() for t in tokens if t not in stop_words]
            # retain nouns longer than 3 characters and lemmatize them
            word_lem = [
                lemmatizer.lemmatize(w, sent.get_wordnet_pos(w))
                for w in no_stopwords
                if sent.get_wordnet_pos(w) == wordnet.NOUN and len(w) > 3
            ]
            aspect.append(word_lem)
        return aspect

    def words_cnt(self, clean_words):
        """Count how often each word occurs across all reviews."""
        count = 0  # running total of tokens (not returned)
        vocab_count = dict()
        for words in clean_words:
            count += len(words)
            for w in words:
                if w in vocab_count:
                    vocab_count[w] += 1
                else:
                    vocab_count[w] = 1
        return vocab_count
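

# --- Usage sketch (illustrative; not part of the original module) ---
# Assumes the required NLTK data packages ('punkt', 'averaged_perceptron_tagger',
# 'wordnet', 'omw-1.4', 'stopwords') have already been downloaded with
# nltk.download(), and that `reviews` is a hypothetical list of raw review strings.
if __name__ == '__main__':
    reviews = [
        "The battery life is great, but the screen scratches easily!",
        "Delivery took 3 days; the packaging was torn.",
    ]
    sent = Sentence()
    cleaned = sent.clean_symbols(reviews)         # strip punctuation and digits
    aspects = sent.extract_aspect(cleaned, sent)  # lemmatized nouns per review
    counts = sent.words_cnt(aspects)              # aspect frequency across reviews
    print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True))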