A short tutorial on preprocessing text with NLTK: tokenization, stopword removal, stemming, lemmatization, and part-of-speech tagging.
poem = "If you can keep your head when all about you,\nare losing theirs and blaming it on you.\nIf you can trust yourself when all men doubt you,\nbut make allowance for their doubting too.\nIf you can wait and not be tired by waiting,\nor being lied about, don’t deal in lies.\nOr being hated, don’t give way to hating,\nand yet don’t look too good, nor talk too wise:"
print(poem)
If you can keep your head when all about you,
are losing theirs and blaming it on you.
If you can trust yourself when all men doubt you,
but make allowance for their doubting too.
If you can wait and not be tired by waiting,
or being lied about, don’t deal in lies.
Or being hated, don’t give way to hating,
and yet don’t look too good, nor talk too wise:
import re
# The newline is a literal single character, so str.replace is the direct
# (and faster) tool — no regex engine needed for a fixed-string substitution.
# `re` is left imported in case later parts of the tutorial use it.
poem_without_newlines = poem.replace('\n', ' ')
from nltk.tokenize import word_tokenize
# Word-level tokenization: punctuation becomes separate tokens, and
# "don’t" is split into "don", "’", "t" (see the printed output below).
tokenized1 = word_tokenize(poem_without_newlines)
print(tokenized1)
['If', 'you', 'can', 'keep', 'your', 'head', 'when', 'all', 'about', 'you', ',', 'are', 'losing', 'theirs', 'and', 'blaming', 'it', 'on', 'you', '.', 'If', 'you', 'can', 'trust', 'yourself', 'when', 'all', 'men', 'doubt', 'you', ',', 'but', 'make', 'allowance', 'for', 'their', 'doubting', 'too', '.', 'If', 'you', 'can', 'wait', 'and', 'not', 'be', 'tired', 'by', 'waiting', ',', 'or', 'being', 'lied', 'about', ',', 'don', '’', 't', 'deal', 'in', 'lies', '.', 'Or', 'being', 'hated', ',', 'don', '’', 't', 'give', 'way', 'to', 'hating', ',', 'and', 'yet', 'don', '’', 't', 'look', 'too', 'good', ',', 'nor', 'talk', 'too', 'wise', ':']
from nltk.tokenize import sent_tokenize
# Sentence-level tokenization: splits the flattened text into one string
# per sentence — four sentences here (see the printed output below).
tokenized2 = sent_tokenize(poem_without_newlines)
print(tokenized2)
['If you can keep your head when all about you, are losing theirs and blaming it on you.', 'If you can trust yourself when all men doubt you, but make allowance for their doubting too.', 'If you can wait and not be tired by waiting, or being lied about, don’t deal in lies.', 'Or being hated, don’t give way to hating, and yet don’t look too good, nor talk too wise:']
# Case normalization: fold the whole text to a single case so that,
# for example, "If" and "if" compare equal in later lookups.
poem_without_newlines_lowercase = poem_without_newlines.lower()
poem_without_newlines_uppercase = poem_without_newlines.upper()
Stopwords are words that we remove during preprocessing when we don’t care about sentence structure. They are usually the most common words in a language and don’t provide any information about the tone of a statement. They include words such as “a”, “an”, and “the”. Note that NLTK’s English stopword list is all lowercase and the filter below compares tokens case-sensitively — that is why capitalized tokens such as “If” and “Or” survive in the output; lowercase the tokens first if you want those removed as well.
from nltk.corpus import stopwords
# Build a set for O(1) membership tests while filtering.
stop_words = set(stopwords.words('english'))
# Keep only the tokens that are not in the stopword set. The comparison is
# case-sensitive, so capitalized tokens ("If", "Or") pass through untouched.
poem_no_stop = [tok for tok in tokenized1 if tok not in stop_words]
print(poem_no_stop)
['If', 'keep', 'head', ',', 'losing', 'blaming', '.', 'If', 'trust', 'men', 'doubt', ',', 'make', 'allowance', 'doubting', '.', 'If', 'wait', 'tired', 'waiting', ',', 'lied', ',', '’', 'deal', 'lies', '.', 'Or', 'hated', ',', '’', 'give', 'way', 'hating', ',', 'yet', '’', 'look', 'good', ',', 'talk', 'wise', ':']
Stemming is concerned with bluntly removing word affixes (prefixes and suffixes), and is a common method used by search engines to improve matching between user input and website hits. For example, stemming would cut the word “going” down to “go”.
from nltk.stem import PorterStemmer
# Porter stemmer: heuristically strips affixes and lowercases the result
# ("losing" -> "lose", "allowance" -> "allow" — see the output below).
stemmer = PorterStemmer()
# Apply the bound stem method to every token in a single pass.
stemmed = list(map(stemmer.stem, tokenized1))
print(stemmed)
['if', 'you', 'can', 'keep', 'your', 'head', 'when', 'all', 'about', 'you', ',', 'are', 'lose', 'their', 'and', 'blame', 'it', 'on', 'you', '.', 'if', 'you', 'can', 'trust', 'yourself', 'when', 'all', 'men', 'doubt', 'you', ',', 'but', 'make', 'allow', 'for', 'their', 'doubt', 'too', '.', 'if', 'you', 'can', 'wait', 'and', 'not', 'be', 'tire', 'by', 'wait', ',', 'or', 'be', 'lie', 'about', ',', 'don', '’', 't', 'deal', 'in', 'lie', '.', 'or', 'be', 'hate', ',', 'don', '’', 't', 'give', 'way', 'to', 'hate', ',', 'and', 'yet', 'don', '’', 't', 'look', 'too', 'good', ',', 'nor', 'talk', 'too', 'wise', ':']
Lemmatization is a method for casting words to their root dictionary forms (lemmas). In the output below only “lies” changes (to “lie”): without a part-of-speech hint, the WordNet lemmatizer treats every token as a noun, so verb forms like “losing” and “hated” are left as-is.
from nltk.stem import WordNetLemmatizer
# WordNet lemmatizer: maps words to dictionary forms. Called without a POS
# argument it lemmatizes tokens as nouns, which is why only the plural
# "lies" -> "lie" changes in the output below.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, tokenized1))
print(lemmatized)
['If', 'you', 'can', 'keep', 'your', 'head', 'when', 'all', 'about', 'you', ',', 'are', 'losing', 'theirs', 'and', 'blaming', 'it', 'on', 'you', '.', 'If', 'you', 'can', 'trust', 'yourself', 'when', 'all', 'men', 'doubt', 'you', ',', 'but', 'make', 'allowance', 'for', 'their', 'doubting', 'too', '.', 'If', 'you', 'can', 'wait', 'and', 'not', 'be', 'tired', 'by', 'waiting', ',', 'or', 'being', 'lied', 'about', ',', 'don', '’', 't', 'deal', 'in', 'lie', '.', 'Or', 'being', 'hated', ',', 'don', '’', 't', 'give', 'way', 'to', 'hating', ',', 'and', 'yet', 'don', '’', 't', 'look', 'too', 'good', ',', 'nor', 'talk', 'too', 'wise', ':']
The process of identifying and labeling the part of speech of each word is known as part-of-speech tagging.
from nltk import pos_tag
# Tag each token with a part-of-speech label, returned as (token, tag)
# pairs — e.g. PRP = personal pronoun, MD = modal (see the output below).
poem_pos_tag = pos_tag(tokenized1)
print(poem_pos_tag)
[('If', 'IN'), ('you', 'PRP'), ('can', 'MD'), ('keep', 'VB'), ('your', 'PRP$'), ('head', 'NN'), ('when', 'WRB'), ('all', 'DT'), ('about', 'IN'), ('you', 'PRP'), (',', ','), ('are', 'VBP'), ('losing', 'VBG'), ('theirs', 'NNS'), ('and', 'CC'), ('blaming', 'VBG'), ('it', 'PRP'), ('on', 'IN'), ('you', 'PRP'), ('.', '.'), ('If', 'IN'), ('you', 'PRP'), ('can', 'MD'), ('trust', 'VB'), ('yourself', 'PRP'), ('when', 'WRB'), ('all', 'DT'), ('men', 'NNS'), ('doubt', 'VBP'), ('you', 'PRP'), (',', ','), ('but', 'CC'), ('make', 'VBP'), ('allowance', 'NN'), ('for', 'IN'), ('their', 'PRP$'), ('doubting', 'NN'), ('too', 'RB'), ('.', '.'), ('If', 'IN'), ('you', 'PRP'), ('can', 'MD'), ('wait', 'VB'), ('and', 'CC'), ('not', 'RB'), ('be', 'VB'), ('tired', 'VBN'), ('by', 'IN'), ('waiting', 'VBG'), (',', ','), ('or', 'CC'), ('being', 'VBG'), ('lied', 'VBN'), ('about', 'IN'), (',', ','), ('don', 'VB'), ('’', 'NNP'), ('t', 'JJ'), ('deal', 'NN'), ('in', 'IN'), ('lies', 'NNS'), ('.', '.'), ('Or', 'CC'), ('being', 'VBG'), ('hated', 'VBN'), (',', ','), ('don', 'VB'), ('’', 'JJ'), ('t', 'NNS'), ('give', 'VBP'), ('way', 'NN'), ('to', 'TO'), ('hating', 'NN'), (',', ','), ('and', 'CC'), ('yet', 'RB'), ('don', 'VB'), ('’', 'JJ'), ('t', 'JJ'), ('look', 'NN'), ('too', 'RB'), ('good', 'JJ'), (',', ','), ('nor', 'CC'), ('talk', 'VB'), ('too', 'RB'), ('wise', 'NN'), (':', ':')]