Code
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download required data (run once)
# nltk.download('punkt')
# nltk.download('punkt_tab')  # newer NLTK versions (>= 3.8.2) load this instead of 'punkt'
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')  # for nltk.pos_tag (see the POS example below)
text = """Natural language processing (NLP) is a fascinating field.
It enables computers to understand and process human language.
NLTK provides excellent tools for learning NLP concepts."""
# Tokenization
sentences = sent_tokenize(text)
words = word_tokenize(text)
print("Sentences:", len(sentences))
print("Words:", len(words))
print("\nFirst sentence tokens:", word_tokenize(sentences[0]))
# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in words if w.lower() not in stop_words and w.isalpha()]
print("\nFiltered words:", filtered_words)
# Stemming vs Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
words_to_process = ['running', 'runs', 'ran', 'easily', 'fairly']
print("\n{:<15} {:<15} {:<15}".format("Original", "Stemmed", "Lemmatized"))
print("-" * 45)
for word in words_to_process:
    stemmed = stemmer.stem(word)
    lemmatized = lemmatizer.lemmatize(word, pos='v')  # 'v' = verb
    print("{:<15} {:<15} {:<15}".format(word, stemmed, lemmatized))
Output
Sentences: 3
Words: 30
First sentence tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', '.']
Filtered words: ['Natural', 'language', 'processing', 'NLP', 'fascinating', 'field', 'enables', 'computers', 'understand', 'process', 'human', 'language', 'NLTK', 'provides', 'excellent', 'tools', 'learning', 'NLP', 'concepts']
Original        Stemmed         Lemmatized
---------------------------------------------
running         run             run
runs            run             run
ran             ran             run
easily          easili          easily
fairly          fairli          fairly
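Notice that the downloads include 'averaged_perceptron_tagger' even though the demo above never tags anything. It belongs to nltk.pos_tag, which assigns Penn Treebank tags; those tags can drive the lemmatizer so you don't have to hard-code pos='v'. Here is a minimal sketch of that workflow. The get_wordnet_pos helper and the sample sentence are our own illustration, not part of NLTK:

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag to a WordNet POS code; default to noun,
    # which is also the lemmatizer's own default.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
tokens = word_tokenize("The striped bats were hanging on their feet")
tagged = nltk.pos_tag(tokens)  # e.g. [('The', 'DT'), ('striped', 'JJ'), ...]
lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in tagged]
print(lemmas)  # 'bats' -> 'bat', 'were' -> 'be', 'feet' -> 'foot'

This is also why the table above passes pos='v' explicitly: without a POS hint, lemmatize() treats every word as a noun, so lemmatizer.lemmatize('ran') returns 'ran' while lemmatizer.lemmatize('ran', pos='v') returns 'run'.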