-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnltk_utils.py
More file actions
33 lines (25 loc) · 852 Bytes
/
nltk_utils.py
File metadata and controls
33 lines (25 loc) · 852 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import nltk
from nltk.stem.porter import PorterStemmer
import numpy as np
# Fetch the tokenizer resources requested from NLTK at import time
# (needed by nltk.word_tokenize, which tokenize() calls).
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)
#First step
def tokenize(sentence):
    """Split ``sentence`` into word tokens using ``nltk.word_tokenize``.

    Args:
        sentence: The text to split.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for ``sentence``.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens
#Second step
# Single shared stemmer instance, created once at import time.
stemmer = PorterStemmer()


def stem(word):
    """Lower-case ``word`` and reduce it to its root form.

    Args:
        word: The token to normalise.

    Returns:
        The result of the module-level Porter ``stemmer`` applied to the
        lower-cased word.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
#Third step
def bag_of_words(tokenized_sentence, words):
    """Encode a tokenized sentence as a binary bag-of-words vector.

    Every token of ``tokenized_sentence`` is passed through ``stem``; the
    result has one slot per entry of ``words``, set to 1.0 when that entry
    occurs among the stemmed tokens and 0.0 otherwise.

    Example:
        tokenized_sentence = ["hello", "how", "are", "you"]
        words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
        bag   = [  0,     1,     0,    1,     0,      0,       0]

    Args:
        tokenized_sentence: Iterable of token strings.
        words: Sequence of vocabulary entries. They are compared verbatim
            (they are NOT stemmed here), so they should already be in the
            lower-cased, stemmed form produced by ``stem`` -- an entry such
            as "I" can never match. Entries must be hashable.

    Returns:
        A 1-D ``np.float32`` array of length ``len(words)``.
    """
    # Build the set of stems once: membership tests become O(1) instead of
    # scanning the whole sentence list for every vocabulary entry.
    sentence_stems = {stem(token) for token in tokenized_sentence}

    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_stems:
            bag[idx] = 1.0
    return bag