Source code for pybrokk.duster

import pandas as pd
from pybrokk.create_id import create_id
from pybrokk.text_from_url import text_from_url

[docs]def duster(urls):
    """
    Prepares a pandas dataframe by webscraping raw text from a list of urls ready to be input into a machine learning model.
    
    Parameters
    ----------
    urls: list
        list of target urls as strings

    Returns
    ----------
    df: pandas dataframe
        A dataframe with the webpage identifiers as a index, the raw url, and the raw text from the webpage with extra line breaks removed.
        
    Examples
    ----------
    >>> from pybrokk.duster import duster
    >>> duster(['https://www.cnn.com/world', 'https://www.foxnews.com/world', 'https://www.cbc.ca/news/world'])
                                        url                                           raw_text
    id
    cnn1          https://www.cnn.com/world   World news - breaking news, video, headlines ...
    foxnews1  https://www.foxnews.com/world  World | Fox NewsFox News   U.S.PoliticsMediaOp...
    cbc1      https://www.cbc.ca/news/world  World - CBC NewsContentSkip to Main ContentAcc...         
    """
    #scrape text from the web
    output = text_from_url(urls)

    #create Dataframe from dictionary output of text_from_url()
    df = pd.DataFrame.from_dict(output, orient='index', columns=["raw_text"]).reset_index().rename(columns={"index":"url"})
    
    #remove line breaks
    df['raw_text'] = df['raw_text'].str.replace("\n", "")
    
    #add id as index
    df['id'] = create_id(df['url'].tolist())
    df = df.set_index('id')
    
    return df