Source code for pybrokk.duster

import pandas as pd
from pybrokk.create_id import create_id
from pybrokk.text_from_url import text_from_url

def duster(urls: list) -> pd.DataFrame:
    """
    Scrape raw text from a list of urls into a pandas dataframe.

    The resulting dataframe is intended as ready-made input for a machine
    learning model.

    Parameters
    ----------
    urls : list
        List of target urls as strings.

    Returns
    -------
    df : pandas.DataFrame
        A dataframe with the webpage identifiers (as produced by
        ``create_id``) as the index, a ``url`` column holding the raw url,
        and a ``raw_text`` column holding the raw text from the webpage
        with line breaks removed.

    Examples
    --------
    >>> from pybrokk.duster import duster
    >>> duster(['https://www.cnn.com/world',
    ...         'https://www.foxnews.com/world',
    ...         'https://www.cbc.ca/news/world'])  # doctest: +SKIP
                                        url                                           raw_text
    id
    cnn1          https://www.cnn.com/world  World news - breaking news, video, headlines ...
    foxnews1  https://www.foxnews.com/world  World | Fox NewsFox News U.S.PoliticsMediaOp...
    cbc1      https://www.cbc.ca/news/world  World - CBC NewsContentSkip to Main ContentAcc...
    """
    # Scrape text from the web; text_from_url() returns a dictionary whose
    # keys become the dataframe index below (renamed to the "url" column).
    scraped = text_from_url(urls)

    # Create the dataframe from the dictionary output of text_from_url().
    df = (
        pd.DataFrame.from_dict(scraped, orient="index", columns=["raw_text"])
        .reset_index()
        .rename(columns={"index": "url"})
    )

    # Remove line breaks. regex=False makes the literal replacement explicit
    # so the result does not depend on the pandas version's default for
    # `regex` (which changed in pandas 2.0).
    df["raw_text"] = df["raw_text"].str.replace("\n", "", regex=False)

    # Add the generated identifiers and use them as the index.
    df["id"] = create_id(df["url"].tolist())
    return df.set_index("id")