In [None]:
!pip install emoji
!pip install pyspellchecker
!pip install contractions

In [1]:
corpus = [
    "I can't wait for the new season of my favorite show!",
    "The COVID-19 pandemic has affected millions of people worldwide.",
    "U.S. stocks fell on Friday after news of rising inflation.",
    "<html><body>Welcome to the website!</body></html>",
    "Python is a great programming language!!! ??"
]


In [5]:
import re
import string
from bs4 import BeautifulSoup

def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    return text

cleaned_corpus = [clean_text(doc) for doc in corpus]
print(cleaned_corpus)


['i cant wait for the new season of my favorite show', 'the covid pandemic has affected millions of people worldwide', 'us stocks fell on friday after news of rising inflation', 'htmlbodywelcome to the websitebodyhtml', 'python is a great programming language ']


In [7]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

tokenized_corpus = [word_tokenize(doc) for doc in cleaned_corpus]
print(tokenized_corpus)

[['i', 'cant', 'wait', 'for', 'the', 'new', 'season', 'of', 'my', 'favorite', 'show'], ['the', 'covid', 'pandemic', 'has', 'affected', 'millions', 'of', 'people', 'worldwide'], ['us', 'stocks', 'fell', 'on', 'friday', 'after', 'news', 'of', 'rising', 'inflation'], ['htmlbodywelcome', 'to', 'the', 'websitebodyhtml'], ['python', 'is', 'a', 'great', 'programming', 'language']]


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
filtered_corpus = [[word for word in doc if word not in stop_words] for doc in tokenized_corpus]
print(filtered_corpus)

[nltk_data] Downloading package stopwords to /root/nltk_data...


[['cant', 'wait', 'new', 'season', 'favorite', 'show'], ['covid', 'pandemic', 'affected', 'millions', 'people', 'worldwide'], ['us', 'stocks', 'fell', 'friday', 'news', 'rising', 'inflation'], ['htmlbodywelcome', 'websitebodyhtml'], ['python', 'great', 'programming', 'language']]


[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_corpus = [[stemmer.stem(word) for word in doc] for doc in filtered_corpus]
lemmatized_corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_corpus]
print(stemmed_corpus)
print(lemmatized_corpus)

[nltk_data] Downloading package wordnet to /root/nltk_data...


[['cant', 'wait', 'new', 'season', 'favorit', 'show'], ['covid', 'pandem', 'affect', 'million', 'peopl', 'worldwid'], ['us', 'stock', 'fell', 'friday', 'news', 'rise', 'inflat'], ['htmlbodywelcom', 'websitebodyhtml'], ['python', 'great', 'program', 'languag']]
[['cant', 'wait', 'new', 'season', 'favorite', 'show'], ['covid', 'pandemic', 'affected', 'million', 'people', 'worldwide'], ['u', 'stock', 'fell', 'friday', 'news', 'rising', 'inflation'], ['htmlbodywelcome', 'websitebodyhtml'], ['python', 'great', 'programming', 'language']]


In [12]:
import contractions

expanded_corpus = [contractions.fix(doc) for doc in cleaned_corpus]
print(expanded_corpus)

['i cannot wait for the new season of my favorite show', 'the covid pandemic has affected millions of people worldwide', 'us stocks fell on friday after news of rising inflation', 'htmlbodywelcome to the websitebodyhtml', 'python is a great programming language ']


In [15]:
import emoji

emoji_corpus = [emoji.demojize(doc) for doc in cleaned_corpus]
print(emoji_corpus)

['i cant wait for the new season of my favorite show', 'the covid pandemic has affected millions of people worldwide', 'us stocks fell on friday after news of rising inflation', 'htmlbodywelcome to the websitebodyhtml', 'python is a great programming language ']


In [18]:
from spellchecker import SpellChecker

spell = SpellChecker()
corrected_corpus = [[spell.correction(word) for word in doc] for doc in tokenized_corpus]
print(corrected_corpus)

[['i', 'cant', 'wait', 'for', 'the', 'new', 'season', 'of', 'my', 'favorite', 'show'], ['the', 'bovid', 'pandemic', 'has', 'affected', 'millions', 'of', 'people', 'worldwide'], ['us', 'stocks', 'fell', 'on', 'friday', 'after', 'news', 'of', 'rising', 'inflation'], [None, 'to', 'the', None], ['python', 'is', 'a', 'great', 'programming', 'language']]
