Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
Project3
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
20200203063
Project3
Commits
17255091
Commit
17255091
authored
Sep 01, 2020
by
20200203063
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Replace sentence.py
parent
cea76185
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
65 additions
and
13 deletions
+65
-13
sentence.py
+65
-13
No files found.
sentence.py
View file @
17255091
# encoding: utf-8
import nltk
from nltk.corpus import stopwords

# English stop-word list used by Sentence.acceptable_word.
# NOTE: this rebinding shadows the imported corpus module name `stopwords`.
stopwords = stopwords.words('english')
# stopwords = []

# Chunk grammar for noun-phrase (aspect candidate) extraction:
# NBAR = adjective/noun run ending in a noun; NP = NBAR, optionally two
# NBARs joined by a preposition (in/of/...).
grammar = r"""
NBAR:
    {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns

NP:
    {<NBAR>}
    {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""

# Module-level lemmatizer shared by Sentence.normalise.
lemmatizer = nltk.WordNetLemmatizer()
#stemmer = nltk.stem.porter.PorterStemmer()
class Sentence(object):
    """A single review sentence with helpers to extract aspect (noun-phrase) terms."""

    # Shared word tokenizer; preserve_case=False lowercases while tokenizing.
    # NOTE(review): MyPottsTokenizer is defined elsewhere in the project.
    WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False)

    # Shared lemmatizer; qualified via nltk to match the module-level import
    # (the bare name WordNetLemmatizer is never imported in this file).
    LEMMATIZER = nltk.WordNetLemmatizer()

    # Extract aspects for each sentence (translated from the original comment).
    # NOTE(review): the diff showed a bare "ASP_EXTRACTOR =" with no right-hand
    # side (a syntax error); a RegexpParser over the module-level grammar matches
    # its role as an aspect extractor — confirm against the full file.
    ASP_EXTRACTOR = nltk.RegexpParser(grammar)
def
__init__
(
self
):
def
__init__
(
self
,
sentence
):
self
.
sentence
=
sentence
def
word_tokenize
(
self
):
# def word_tokenize(self):
# return
# def pos_tag(self):
# return
# def lemmatize(self):
# return
# def contain_aspect(self):
# return
def
extract_noun_phrase
(
self
):
tree
=
self
.
execute
()
nps
=
[
np
for
np
in
self
.
get_terms
(
tree
)]
return
nps
def
pos_tag
(
self
):
def
lemmatize
(
self
):
def
execute
(
self
):
# Taken from Su Nam Kim Paper...
chunker
=
nltk
.
RegexpParser
(
grammar
)
#toks = nltk.regexp_tokenize(text, sentence_re)
# #postoks = nltk.tag.pos_tag(toks)
toks
=
nltk
.
word_tokenize
(
self
.
sentence
)
postoks
=
nltk
.
tag
.
pos_tag
(
toks
)
tree
=
chunker
.
parse
(
postoks
)
return
tree
def
leaves
(
self
,
tree
):
"""Finds NP (nounphrase) leaf nodes of a chunk tree."""
for
subtree
in
tree
.
subtrees
(
filter
=
lambda
t
:
t
.
label
()
==
'NP'
):
yield
subtree
.
leaves
()
def
normalise
(
self
,
word
):
"""Normalises words to lowercase and stems and lemmatizes it."""
word
=
word
.
lower
()
# word = stemmer.stem(word)
word
=
lemmatizer
.
lemmatize
(
word
)
return
word
def
contain_aspect
(
self
):
def
acceptable_word
(
self
,
word
):
"""Checks conditions for acceptable word: length, stopword."""
\ No newline at end of file
accepted
=
bool
(
2
<=
len
(
word
)
<=
40
and
word
.
lower
()
not
in
stopwords
)
return
accepted
def
get_terms
(
self
,
tree
):
for
leaf
in
self
.
leaves
(
tree
):
term
=
" "
.
join
([
self
.
normalise
(
w
)
for
w
,
t
in
leaf
if
self
.
acceptable_word
(
w
)])
yield
term
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment