Twitter sentiment analysis: data cleaning and text feature engineering.
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',1000)
pd.set_option('max_columns',100)
pd.set_option('max_rows',50)
#========= NLP
import re
import string
import unidecode
import wordninja
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords
print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])
#=======OTHERS
import scipy
import multiprocessing as mp
[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]
df_train_raw = pd.read_csv('../data/raw/train.csv',nrows=50)
df_test_raw = pd.read_csv('../data/raw/test.csv',nrows=50)
df = df_train_raw.append(df_test_raw)
df = df.reset_index()
print(f"shape df_train_raw: {df_train_raw.shape}")
print(f"shape df_test_raw: {df_test_raw.shape}")
df.head(2).append(df.tail(2))
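Note: DataFrame.append is deprecated in later pandas versions (and removed in 2.0). A forward-compatible way to combine the two frames:
df = pd.concat([df_train_raw, df_test_raw]).reset_index()
Here reset_index() keeps the old index as the extra 'index' column seen in the tables below.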
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
shape df_train_raw: (50, 3)
shape df_test_raw: (50, 2)
import multiprocessing as mp
def parallelize_dataframe(df, func):
ncores = mp.cpu_count()
df_split = np.array_split(df, ncores)
pool = mp.Pool(ncores)
df = pd.concat(pool.map(func, df_split))
pool.close()
pool.join()
return df
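A minimal usage sketch (toy_add_len is a hypothetical example; any function that takes and returns a DataFrame chunk works, as long as it is picklable, i.e. defined at module top level):
def toy_add_len(chunk):
    # add a character-count column to each chunk
    chunk['tweet_len'] = chunk['tweet'].str.len()
    return chunk
# df = parallelize_dataframe(df, toy_add_len)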
from urllib.parse import urlparse
def is_url(url):
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False
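Note that is_url only treats a token as a url when it has both a scheme and a netloc, so bare domains like www.xy.com are not caught and survive cleaning (a later example shows this):
is_url('https://goo.gl/h1MfQV') # True: has scheme and netloc
is_url('www.xy.com')            # False: no scheme, so the token is kept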
def process_text(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: decode unicode characters
s = s.apply(unidecode.unidecode)
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'<3': 'love',
'awsm': 'awesome',
'b4': 'before',
'bc': 'because',
'bday': 'birthday',
'dm': 'direct message',
'doin': 'doing',
'gr8': 'great',
'gud': 'good',
'h8': 'hate',
'hw': 'how',
'idc': 'i do not care',
'idgaf': 'hate',
'irl': 'in real life',
'k': 'okay',
'lv': 'love',
'm': 'am',
'r': 'are',
'rt': 'retweet',
'ttyl': 'talk to you later',
'ty': 'thank you',
'u': 'you',
'wlcm': 'welcome',
'wtf': 'hate',
'xoxo': 'love',
'y': 'why',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
text = "rt text2num! yesyes gud www.xy.com amazing"
process_text(text)
['retwet', 'textnum', 'yes', 'god', 'wwxycom', 'amazing']
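The repeated-substring regex r'(\w+)\1' also collapses doubled letters, which is why 'rt' ==> 'retweet' ==> 'retwet' and 'gud' ==> 'good' ==> 'god' above:
re.sub(r'(\w+)\1', r'\1', 'retweet') # 'retwet' ('ee' collapses to 'e')
re.sub(r'(\w+)\1', r'\1', 'good')    # 'god' ('oo' collapses to 'o')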
def add_features(df):
df[mcl] = df[maincol].apply(process_text)
df[mc] = df[mcl].str.join(' ')
df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
#df['hashtags'] = df[mc].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df['hashtags'] = df['hashtags_lst'].str.join(' ')
return df
%%time
df = parallelize_dataframe(df, add_features)
CPU times: user 17.5 ms, sys: 20.8 ms, total: 38.3 ms Wall time: 491 ms
df.head()
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | [#fingerprint, #Pregnancy, #android, #apps, #beautiful, #cute, #health, #igers, #iphoneonly, #iphonesia, #iphone] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | [#yay, #Sony, #Xperia, #S, #sonyexperias…] | #yay #Sony #Xperia #S #sonyexperias… |
2 | 2 | 3 | 0.0 | We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu | [love, would, go, talk, makemories, unplug, relax, iphone, smartphone, wifi, conect] | love would go talk makemories unplug relax iphone smartphone wifi conect | [#talk, #makememories, #unplug, #relax, #iphone, #smartphone, #wifi, #connect...] | #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... |
3 | 3 | 4 | 0.0 | I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/ | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home | [#iphone, #cute, #daventry, #home] | #iphone #cute #daventry #home |
4 | 4 | 5 | 1.0 | What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support! | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport | [] |
%run emoticons.py
%run emojis.py
def convert_emoticons(text):
for emot in EMOTICONS:
text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
return text
text1 = "Hello :-) :-)"
text2 = "Thanks to my uncle :) #yay"
print(convert_emoticons(text1))
print(convert_emoticons(text2))
Hello Happy_face_smiley Happy_face_smiley Thanks to my uncle Happy_face_or_smiley #yay
def convert_emojis(text):
for emot in UNICODE_EMO:
text = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
return text
text = "game is on 🔥"
convert_emojis(text)
'game is on fire'
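The helper files emoticons.py and emojis.py are not shown here; a minimal sketch of the dictionary structure the two converters assume (keys are regex-escaped patterns, values are descriptions), consistent with the outputs above:
EMOTICONS = {u":\\)": "Happy face or smiley", u":\\-\\)": "Happy face smiley"}
UNICODE_EMO = {u"🔥": ":fire:"}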
def process_text_emoji(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: decode unicode characters
s = s.apply(unidecode.unidecode)
# step: howareyou ==> how are you
s = s.apply(lambda x: ' '.join(wordninja.split(x)))
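# (note: wordninja also strips punctuation, so emoticons like ':)' are
# already gone before the converters below run; the tables further down
# confirm no Happy_face tokens appear. running convert_emoticons before
# wordninja would preserve them)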
# step: expand emoticons and emojis
s = s.apply(convert_emoticons)
s = s.apply(convert_emojis)
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'<3': 'love',
'awsm': 'awesome',
'b4': 'before',
'bc': 'because',
'bday': 'birthday',
'dm': 'direct message',
'doin': 'doing',
'gr8': 'great',
'gud': 'good',
'h8': 'hate',
'hw': 'how',
'idc': 'i do not care',
'idgaf': 'hate',
'irl': 'in real life',
'k': 'okay',
'lv': 'love',
'm': 'am',
'r': 'are',
'rt': 'retweet',
'ttyl': 'talk to you later',
'ty': 'thank you',
'u': 'you',
'wlcm': 'welcome',
'wtf': 'hate',
'xoxo': 'love',
'y': 'why',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
text = "rt text2num! yesyes gud www.xy.com amazing"
process_text(text)
['retwet', 'textnum', 'yes', 'god', 'wwxycom', 'amazing']
def add_features_emoji(df):
# we need to remove url first
df[mcle] = df[maincol].str.replace(r'http\S+|www\.\S+', '', case=False)
df[mcle] = df[mcle].apply(process_text_emoji)
df[mce] = df[mcle].str.join(' ')
return df
add_features_emoji(df.head().copy())
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | [#fingerprint, #Pregnancy, #android, #apps, #beautiful, #cute, #health, #igers, #iphoneonly, #iphonesia, #iphone] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | [#yay, #Sony, #Xperia, #S, #sonyexperias…] | #yay #Sony #Xperia #S #sonyexperias… | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
2 | 2 | 3 | 0.0 | We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu | [love, would, go, talk, makemories, unplug, relax, iphone, smartphone, wifi, conect] | love would go talk makemories unplug relax iphone smartphone wifi conect | [#talk, #makememories, #unplug, #relax, #iphone, #smartphone, #wifi, #connect...] | #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... | [love, would, go, talk, make, memory, unplug, relax, iphone, smartphone, wi, fi, conect] | love would go talk make memory unplug relax iphone smartphone wi fi conect |
3 | 3 | 4 | 0.0 | I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/ | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home | [#iphone, #cute, #daventry, #home] | #iphone #cute #daventry #home | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home |
4 | 4 | 5 | 1.0 | What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support! | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport | [] | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport |
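wordninja is what splits glued-together words in the emoji pipeline, which is why compound hashtags come apart in tweet_lst_clean_emoji; it can also over-split unfamiliar tokens (xperia ==> x peri above). A quick check of the expected behaviour:
wordninja.split('makememories') # ['make', 'memories'], later lemmatized to 'memory'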
%%time
# This takes a long time on the full data
df = parallelize_dataframe(df, add_features_emoji)
CPU times: user 18.9 ms, sys: 18.4 ms, total: 37.3 ms Wall time: 5.26 s
df.head()
index | id | label | tweet | tweet_lst_clean | tweet_clean | hashtags_lst | hashtags | tweet_lst_clean_emoji | tweet_clean_emoji | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0.0 | #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone] | fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone | [#fingerprint, #Pregnancy, #android, #apps, #beautiful, #cute, #health, #igers, #iphoneonly, #iphonesia, #iphone] | #fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone | [fingerprint, pregnancy, test, android, aps, beautiful, cute, health, iger, iphone, iphones, iphone] | fingerprint pregnancy test android aps beautiful cute health iger iphone iphones iphone |
1 | 1 | 2 | 0.0 | Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/ | [finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias] | finaly transparant silicon case thanks uncle yay sony xperia sonyexperias | [#yay, #Sony, #Xperia, #S, #sonyexperias…] | #yay #Sony #Xperia #S #sonyexperias… | [finaly, trans, paran, silicon, case, thanks, uncle, yay, sony, x, peri, sony, ex, peri] | finaly trans paran silicon case thanks uncle yay sony x peri sony ex peri |
2 | 2 | 3 | 0.0 | We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu | [love, would, go, talk, makemories, unplug, relax, iphone, smartphone, wifi, conect] | love would go talk makemories unplug relax iphone smartphone wifi conect | [#talk, #makememories, #unplug, #relax, #iphone, #smartphone, #wifi, #connect...] | #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... | [love, would, go, talk, make, memory, unplug, relax, iphone, smartphone, wi, fi, conect] | love would go talk make memory unplug relax iphone smartphone wi fi conect |
3 | 3 | 4 | 0.0 | I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/ | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home | [#iphone, #cute, #daventry, #home] | #iphone #cute #daventry #home | [wired, know, george, made, way, iphone, cute, daventry, home] | wired know george made way iphone cute daventry home |
4 | 4 | 5 | 1.0 | What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support! | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport | [] | [amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport] | amazing service aple wil even talk question unles pay stupid suport |
note = """
Look the clean tweet properly:
- look for url links
- look for ellipsis e.g. #sonyexperias…
- convert emoji and emoticons eg. :) with library emot
""";
def create_text_features(df):
# total
df['total_length'] = df[maincol].apply(len)
# num of word and sentence
df['num_words'] = df[maincol].apply(lambda x: len(x.split()))
df['num_sent']=df[maincol].apply(lambda x:
len(re.findall("\n",str(x)))+1)
df['num_unique_words'] = df[maincol].apply(
lambda x: len(set(w for w in x.split())))
df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df['num_uppercase'] = df[maincol].apply(
lambda x: sum(1 for c in x if c.isupper()))
# num of certain characters ! ? . @
df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))
df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))
df['num_punctuation'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '.,;:'))
df['num_symbols'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '*&$%'))
df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
# average
df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df['avg_uppercase'] = df.apply(
lambda row: float(row['num_uppercase'])/float(row['total_length']),
axis=1)
df['avg_unique'] = df['num_unique_words'] / df['num_words']
return df
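A quick sanity check of these counts on one made-up tweet:
demo = pd.DataFrame({maincol: ["What amazing service! I pay $19.95?"]})
demo = create_text_features(demo)
# total_length=35, num_words=6, num_exclamation_marks=1,
# num_question_marks=1, num_symbols=1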
%%time
df = parallelize_dataframe(df, create_text_features)
CPU times: user 25.9 ms, sys: 22.1 ms, total: 48 ms Wall time: 244 ms
df[[maincol,mcl,mcle]].tail()
tweet | tweet_lst_clean | tweet_lst_clean_emoji | |
---|---|---|---|
95 | #organic #farming #orange and #apple #snack *for more high quality photos… https://www.instagram.com/p/BHPAdCnDYtz/ | [organic, farming, orange, aple, snack, high, quality, photo] | [organic, farming, orange, aple, snack, high, quality, photo] |
96 | Fellow passenger owns @oneplus One introduced myself and happen to speak how amazing the big boy is! #OnePlus growing strong! | [felow, pasenger, owns, oneplus, one, introduced, hapen, speak, amazing, big, boy, oneplus, growing, strong] | [felow, pasenger, owns, one, plus, one, introduced, hapen, speak, amazing, big, boy, one, plus, growing, strong] |
97 | New dress, thanks to momma #apple #dress #littleblackdress #dresses #bestmomma #cute #beautiful ... http://dlvr.it/5PJzxv | [new, dres, thanks, moma, aple, dres, litleblackdres, dreses, bestmoma, cute, beautiful] | [new, dres, thanks, moma, aple, dres, litle, black, dres, dreses, best, moma, cute, beautiful] |
98 | Finally got the android 2.1 update for the moto milestone :) happy but came 2 weeks late :S #android #motorola | [finaly, got, android, update, moto, milestone, hapy, came, weks, late, android, motorola] | [finaly, got, android, update, moto, milestone, hapy, came, weks, late, android, motorola] |
99 | ok. second time in seven days that my #apple #magicmouse batteries need to be replaced. #thisaintright #apple xoxo, antigirl | [ok, second, time, seven, day, aple, magicmouse, bateries, ned, replaced, thisaintright, aple, xo, antigirl] | [ok, second, time, seven, day, aple, magic, mouse, bateries, ned, replaced, right, aple, x, oxo, anti, girl] |
arr_all_words = df[mcle].sum()
least_freq_words = np.unique(arr_all_words,return_counts=True)[0][-100:] # NOTE: alphabetical tail, not truly least frequent (see below)
least_freq_words
array(['towards', 'trans', 'travel', 'tre', 'trending', 'trip', 'trols', 'truth', 'tt', 'tv', 'twe', 'twet', 'tweted', 'twitch', 'twiter', 'two', 'type', 'u', 'ude', 'ugh', 'ugo', 'uk', 'um', 'uncle', 'unles', 'unplug', 'unresponsive', 'update', 'upgrade', 'ur', 'urban', 'useles', 'v', 'vacation', 'verification', 'vertical', 'vet', 'vg', 'vgr', 'vi', 'via', 'vibe', 'vidal', 'video', 'vien', 'view', 'vocation', 'w', 'wag', 'waiter', 'waiting', 'wal', 'walpaper', 'walpapers', 'want', 'wanting', 'watch', 'water', 'way', 'weather', 'wek', 'weks', 'white', 'wi', 'wife', 'wil', 'wild', 'window', 'wired', 'without', 'wods', 'woman', 'work', 'worldwide', 'would', 'wq', 'writing', 'wtc', 'x', 'xbox', 'xf', 'xp', 'yad', 'yay', 'year', 'yec', 'yelow', 'yes', 'young', 'youtube', 'z', 'za', 'zand', 'zar', 'zeland', 'zi', 'zn', 'zp', 'zu', 'zw'], dtype='<U13')
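Since np.unique returns the words sorted alphabetically, the slice above is really the last 100 unique words in alphabetical order. A sketch that sorts by count to get the actual least-frequent words:
words, counts = np.unique(arr_all_words, return_counts=True)
least_freq_words = words[np.argsort(counts)][:100] # the 100 smallest counts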
%%writefile sentiment_analysis_data_processing.py
# load the path
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
# load the libraries
import numpy as np
import pandas as pd
import time
import re
import string
from urllib.parse import urlparse
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords
import unidecode
import wordninja
time_start = time.time()
# Load the data
df_train_raw = pd.read_csv('../data/raw/train.csv')
df_test_raw = pd.read_csv('../data/raw/test.csv')
df = df_train_raw.append(df_test_raw)
df = df.reset_index()
# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
# ==================== Useful functions ==============
def parallelize_dataframe(df, func):
ncores = mp.cpu_count()
df_split = np.array_split(df, ncores)
pool = mp.Pool(ncores)
df = pd.concat(pool.map(func, df_split))
pool.close()
pool.join()
return df
def is_url(url):
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False
#================== Text processing =================
def process_text(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'<3': 'love',
'awsm': 'awesome',
'b4': 'before',
'bc': 'because',
'bday': 'birthday',
'dm': 'direct message',
'doin': 'doing',
'gr8': 'great',
'gud': 'good',
'h8': 'hate',
'hw': 'how',
'idc': 'i do not care',
'idgaf': 'hate',
'irl': 'in real life',
'k': 'okay',
'lv': 'love',
'm': 'am',
'r': 'are',
'rt': 'retweet',
'ttyl': 'talk to you later',
'ty': 'thank you',
'u': 'you',
'wlcm': 'welcome',
'wtf': 'hate',
'xoxo': 'love',
'y': 'why',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
def add_features(df):
df[mcl] = df[maincol].apply(process_text)
df[mc] = df[mcl].str.join(' ')
df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
#df['hashtags'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df['hashtags'] = df['hashtags_lst'].str.join(' ')
return df
print("Creating clean tweet and hashtags ...")
df = parallelize_dataframe(df, add_features)
#======================= Text Feature Generation =====
def create_text_features(df):
# total
df['total_length'] = df[maincol].apply(len)
# num of word and sentence
df['num_words'] = df[maincol].apply(lambda x: len(x.split()))
df['num_sent']=df[maincol].apply(lambda x:
len(re.findall("\n",str(x)))+1)
df['num_unique_words'] = df[maincol].apply(
lambda x: len(set(w for w in x.split())))
df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df['num_uppercase'] = df[maincol].apply(
lambda x: sum(1 for c in x if c.isupper()))
# num of certain characters ! ? . @
df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))
df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))
df['num_punctuation'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '.,;:'))
df['num_symbols'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '*&$%'))
df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
# average
df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df['avg_uppercase'] = df.apply(
lambda row: float(row['num_uppercase'])/float(row['total_length']),
axis=1)
df['avg_unique'] = df['num_unique_words'] / df['num_words']
return df
print("Adding Text features ...")
df = parallelize_dataframe(df, create_text_features)
#===================== Emoticons =====================
from emoticons import *
def convert_emoticons(text):
for emot in EMOTICONS:
text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
return text
#===================== Save clean data =========================
df.to_csv('../data/processed/df_combined_clean.csv',index=False)
time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
print(f"Data cleaning finished in {m} min {s:.2f} sec.")
Overwriting sentiment_analysis_data_processing.py
%%writefile sentiment_analysis_data_processing.py
# load the path
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')
# load the libraries
import numpy as np
import pandas as pd
import time
import re
import string
from urllib.parse import urlparse
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords
import unidecode
import wordninja
time_start = time.time()
# Load the data
df_train_raw = pd.read_csv('../data/raw/train.csv')
df_test_raw = pd.read_csv('../data/raw/test.csv')
df = df_train_raw.append(df_test_raw)
df = df.reset_index()
# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'
# ==================== Useful functions ==============
def parallelize_dataframe(df, func):
ncores = mp.cpu_count()
df_split = np.array_split(df, ncores)
pool = mp.Pool(ncores)
df = pd.concat(pool.map(func, df_split))
pool.close()
pool.join()
return df
def is_url(url):
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False
#================== Text processing =================
def process_text(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'<3': 'love',
'awsm': 'awesome',
'b4': 'before',
'bc': 'because',
'bday': 'birthday',
'dm': 'direct message',
'doin': 'doing',
'gr8': 'great',
'gud': 'good',
'h8': 'hate',
'hw': 'how',
'idc': 'i do not care',
'idgaf': 'hate',
'irl': 'in real life',
'k': 'okay',
'lv': 'love',
'm': 'am',
'r': 'are',
'rt': 'retweet',
'ttyl': 'talk to you later',
'ty': 'thank you',
'u': 'you',
'wlcm': 'welcome',
'wtf': 'hate',
'xoxo': 'love',
'y': 'why',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
def add_features(df):
df[mcl] = df[maincol].apply(process_text)
df[mc] = df[mcl].str.join(' ')
df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
#df['hashtags'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df['hashtags'] = df['hashtags_lst'].str.join(' ')
return df
print("Creating clean tweet and hashtags ...")
df = parallelize_dataframe(df, add_features)
#======================= Text Feature Generation =====
def create_text_features(df):
# total
df['total_length'] = df[maincol].apply(len)
# num of word and sentence
df['num_words'] = df[maincol].apply(lambda x: len(x.split()))
df['num_sent']=df[maincol].apply(lambda x:
len(re.findall("\n",str(x)))+1)
df['num_unique_words'] = df[maincol].apply(
lambda x: len(set(w for w in x.split())))
df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df['num_uppercase'] = df[maincol].apply(
lambda x: sum(1 for c in x if c.isupper()))
# num of certain characters ! ? . @
df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))
df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))
df['num_punctuation'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '.,;:'))
df['num_symbols'] = df[maincol].apply(
lambda x: sum(x.count(w) for w in '*&$%'))
df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
# average
df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df['avg_uppercase'] = df.apply(
lambda row: float(row['num_uppercase'])/float(row['total_length']),
axis=1)
df['avg_unique'] = df['num_unique_words'] / df['num_words']
return df
print("Adding Text features ...")
df = parallelize_dataframe(df, create_text_features)
#===================== Manipulating emoticons and emojis
from emojis import *
from emoticons import *
def convert_emoticons(text):
for emot in EMOTICONS:
text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
return text
def convert_emojis(text):
for emot in UNICODE_EMO:
text = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
return text
def process_text_emoji(text):
"""
Do a basic text processing.
Parameters
-----------
text : string
Returns
--------
This function returns pandas series having one list
with clean text.
1: split combined text
2: lowercase
3: expand apostrophes
4: remove punctuation
5: remove digits
6: remove repeated substring
7: remove stop words
8: lemmatize
Example:
========
import re
import string
from nltk.corpus import stopwords
import nltk
text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
process_text(text)
# ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']
"""
s = pd.Series([text])
# step: decode unicode characters
s = s.apply(unidecode.unidecode)
# step: howareyou ==> how are you
s = s.apply(lambda x: ' '.join(wordninja.split(x)))
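# (note: wordninja also strips punctuation, so emoticons like ':)' are
# already gone before the converters below run)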
# step: expand emoticons and emojis
s = s.apply(convert_emoticons)
s = s.apply(convert_emojis)
# step: Split combined words areYou ==> are You
#s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))
# step: lowercase
s = s.str.lower()
# step: remove ellipsis
#s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
s = s.str.replace(r'…+', '', regex=True)
# step: remove url
#s = s.str.replace('http\S+|www.\S+', '', case=False)
s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])
# step: expand apostrophes
map_apos = {
"you're": 'you are',
"i'm": 'i am',
"he's": 'he is',
"she's": 'she is',
"it's": 'it is',
"they're": 'they are',
"can't": 'can not',
"couldn't": 'could not',
"don't": 'do not',
"don;t": 'do not',
"didn't": 'did not',
"doesn't": 'does not',
"isn't": 'is not',
"wasn't": 'was not',
"aren't": 'are not',
"weren't": 'were not',
"won't": 'will not',
"wouldn't": 'would not',
"hasn't": 'has not',
"haven't": 'have not',
"what's": 'what is',
"that's": 'that is',
}
sa = pd.Series(s.str.split()[0])
sb = sa.map(map_apos).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: expand shortcuts
shortcuts = {'u': 'you', 'y': 'why', 'r': 'are',
'doin': 'doing', 'hw': 'how',
'k': 'okay', 'm': 'am', 'b4': 'before',
'idc': "i do not care", 'ty': 'thankyou',
'wlcm': 'welcome', 'bc': 'because',
'<3': 'love', 'xoxo': 'love',
'ttyl': 'talk to you later', 'gr8': 'great',
'bday': 'birthday', 'awsm': 'awesome',
'gud': 'good', 'h8': 'hate',
'lv': 'love', 'dm': 'direct message',
'rt': 'retweet', 'wtf': 'hate',
'idgaf': 'hate','irl': 'in real life',
'yolo': 'you only live once'}
sa = pd.Series(s.str.split()[0])
sb = sa.map(shortcuts).fillna(sa)
sentence = sb.str.cat(sep=' ')
s = pd.Series([sentence])
# step: remove punctuation
s = s.str.translate(str.maketrans(' ',' ',
string.punctuation))
# step: remove newlines and digits
s = s.str.translate(str.maketrans(' ', ' ', '\n'))
s = s.str.translate(str.maketrans(' ', ' ', string.digits))
# step: remove repeated substring yesyes ==> yes (also collapses doubled letters: apple ==> aple)
s = s.str.replace(r'(\w+)\1',r'\1',regex=True)
# step: remove stop words
stop = set(stopwords.words('english')) # nltk expects lowercase 'english'
extra_stop_words = ['...']
stop.update(extra_stop_words) # inplace operation
s = s.str.split()
s = s.apply(lambda x: [w for w in x if w not in stop])
# step: convert word to base form or lemmatize
lemmatizer = nltk.stem.WordNetLemmatizer()
s = s.apply(lambda lst: [lemmatizer.lemmatize(word)
for word in lst])
return s.to_numpy()[0]
def add_features_emoji(df):
# we need to remove url first
df[mcle] = df[maincol].str.replace(r'http\S+|www\.\S+', '', case=False)
df[mcle] = df[mcle].apply(process_text_emoji)
df[mce] = df[mcle].str.join(' ')
return df
print("Adding Emoticons and emoji features ...")
df = parallelize_dataframe(df, add_features_emoji)
#===================== Save clean data =========================
df.to_csv('../data/processed/df_combined_clean.csv',index=False)
time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
print(f"Data cleaning finished in {m} min {s:.2f} sec.")
# Data cleaning finished in 13.0 min 28.34 sec.
Overwriting sentiment_analysis_data_processing.py
# !open .
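The saved script can then be run from a shell or directly from the notebook:
# !python sentiment_analysis_data_processing.py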