spaCy is a free, open-source library for advanced Natural Language Processing (NLP) in Python. This post explains how to use spaCy for tokenization.
If spaCy is not installed, follow the "install spaCy" link to set it up first.
import spacy

# Load the en_core_web_sm English pipeline; only its tokenizer is used below.
model = spacy.load("en_core_web_sm")
text = "This post is published on gcptutorials.com"

# Call the pipeline's tokenizer directly on the raw text to split it into tokens.
tokens = model.tokenizer(text)

# Print the text of each token on its own line.
for token in tokens:
    print(token.text)
'''Output
This
post
is
published
on
gcptutorials.com
'''
import spacy

# Load the en_core_web_sm English pipeline; only its tokenizer is used below.
model = spacy.load("en_core_web_sm")
text = "This post is published on gcptutorials.com"

# Call the pipeline's tokenizer directly on the raw text to split it into tokens.
tokens = model.tokenizer(text)

# For each token, print its text followed by its is_stop flag, which tells
# whether the token is a stop word (e.g. "is", "on").
for token in tokens:
    print(token.text, token.is_stop)
'''Output
This True
post False
is True
published False
on True
gcptutorials.com False
'''
import spacy

# Load the en_core_web_sm English pipeline; only its tokenizer is used below.
model = spacy.load("en_core_web_sm")
text = "This post is published on gcptutorials.com"

# Call the pipeline's tokenizer directly on the raw text to split it into tokens.
tokens = model.tokenizer(text)

# For each token, print its text followed by its is_alpha flag, which tells
# whether the token consists of alphabetic characters only (so a token
# containing "." such as "gcptutorials.com" is reported as False).
for token in tokens:
    print(token.text, token.is_alpha)
'''Output
This True
post True
is True
published True
on True
gcptutorials.com False
'''
Category: Python