Source code for pybrokk.bow

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def bow(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append bag-of-words count columns built from the last column of ``df``.

    The last column of ``df`` is treated as raw text and vectorized with
    scikit-learn's ``CountVectorizer`` using its default settings. The
    resulting per-token counts are appended, one column per token, to the
    right of all the original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        A data frame whose last column holds the raw text documents,
        one document per row.

    Returns
    -------
    df_bow : pandas.DataFrame
        All columns of ``df`` (including the text column), followed by one
        integer count column per token in the fitted vocabulary. The row
        index of ``df`` is preserved.

    Notes
    -----
    The token columns are concatenated as-is: if a token is spelled the same
    as an existing column name (for example ``"text"``), the result contains
    duplicate column labels.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     "url_id": ["a1", "b1"],
    ...     "text": ["red apple", "green apple tree"],
    ... })
    >>> bow(df)
      url_id              text  apple  green  red  tree
    0     a1         red apple      1      0    1     0
    1     b1  green apple tree      1      1    0     1
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(df.iloc[:, -1])

    # Build the token frame directly on the caller's index so that the
    # column-wise concat below aligns row-for-row instead of by position
    # against a fresh RangeIndex.
    df_words = pd.DataFrame(
        counts.toarray(),
        index=df.index,
        columns=vectorizer.get_feature_names_out(),
    )
    return pd.concat([df, df_words], axis=1)