Source code for pybrokk.bow

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def bow(df):
    """
    Append a bag-of-words representation of the last column to a data frame.

    The last column of ``df`` is treated as a corpus (one document per row)
    and vectorized with scikit-learn's ``CountVectorizer`` using its default
    settings. The resulting per-document word counts are attached to the
    right of the original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        A data frame whose last column holds the raw text to vectorize.

    Returns
    -------
    df_bow : pandas.DataFrame
        All columns of ``df`` (the text column included), followed by one
        column per vocabulary term found by the vectorizer. Each added
        column holds that term's count for the row. The index of ``df``
        is preserved.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     "url": ["https://www.cnn.com/world",
    ...             "https://www.foxnews.com/world",
    ...             "https://www.cbc.ca/news/world"],
    ...     "url_id": ["cnn1", "foxnews1", "cbc1"],
    ...     "text": ["Instagram has a faster chance of reaching me than CNN, "
    ...              "and if I really want to know what's going on, I refresh "
    ...              "my Twitter feed.",
    ...              "I would appear on Fox News more easily than I would NPR.",
    ...              "CBC has a very important mandate to bind Canada together "
    ...              "in both official languages, tell local stories, and make "
    ...              "sure we have a sense of our strength, our culture, our "
    ...              "stories."]
    ... })
    >>> bow(df)  # doctest: +SKIP

    The result (abridged) keeps the original columns::

        =============================== ========== ==============================
        url                             url_id     text
        =============================== ========== ==============================
        https://www.cnn.com/world       cnn1       Instagram has a faster ...
        https://www.foxnews.com/world   foxnews1   I would appear on Fox ...
        https://www.cbc.ca/news/world   cbc1       CBC has a very important ...
        =============================== ========== ==============================

    and adds word-count columns such as::

        ======== ====== ======== ====== ========= ====== ======
        appear   bind   canada   ...    tell      twitter want
        ======== ====== ======== ====== ========= ====== ======
        0        0      0        ...    0         1      1
        1        0      0        ...    0         0      0
        0        1      1        ...    1         0      0
        ======== ====== ======== ====== ========= ====== ======
    """
    vectorizer = CountVectorizer()

    # Learn the vocabulary from the last column and count terms per row.
    counts = vectorizer.fit_transform(df.iloc[:, -1])

    # Densify the sparse count matrix into a frame labelled by vocabulary
    # term, then give it the caller's index so the column-wise concat below
    # lines rows up with `df` instead of aligning on a fresh 0..n-1 index.
    bag = pd.DataFrame(
        data=counts.toarray(),
        columns=vectorizer.get_feature_names_out(),
    ).set_index(df.index)

    return pd.concat([df, bag], axis=1)