Source code for pybrokk.pybrokk

from urllib.parse import urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

def _site_name(url):
    """Return the host label of *url* that is used as the website name."""
    # A scheme-less string such as "www.cnn.com/world" only exposes a host
    # once it is parsed as a network-path reference, hence the "//" retry.
    host = urlsplit(url).hostname or urlsplit("//" + url).hostname or ""
    labels = [label for label in host.split(".") if label]
    if not labels:
        raise ValueError(f"cannot determine a website name from url: {url!r}")
    # With a subdomain ("www.reddit.com") the second label names the site;
    # a bare domain ("example.com") or a single-label host ("localhost")
    # is named by its first label.
    return labels[1] if len(labels) >= 3 else labels[0]


def create_id(urls):
    """
    Convert a list of urls into a list of identifiers for downstream use.

    Each identifier is the website name taken from the url's host followed
    by a running count of how many urls from that website have been seen so
    far, so the returned identifiers are unique within one call.

    Parameters
    ----------
    urls: list
        A list of urls as strings

    Returns
    ----------
    ids: list
        A list of unique identifiers as strings, in the same order as `urls`.
        The website name is lower-cased because host names are parsed with
        `urllib.parse.urlsplit`.

    Raises
    ----------
    ValueError
        If no host can be extracted from one of the urls.

    Examples
    ----------
    >>> create_id(['https://www.reddit.com/r/nba/',
    ...            'https://www.reddit.com/r/nfl/',
    ...            'https://vancouver.craigslist.org/search/apa',
    ...            'https://www.kijiji.ca/b-real-estate/richmond-bc/c34l1700288'])
    ['reddit1', 'reddit2', 'craigslist1', 'kijiji1']
    >>> create_id(['https://example.com/index.html'])
    ['example1']
    """
    ids = []
    seen = {}  # website name -> number of urls seen for it so far
    for url in urls:
        name = _site_name(url)
        seen[name] = seen.get(name, 0) + 1
        ids.append(f"{name}{seen[name]}")
    return ids
def text_from_url(urls, *, timeout=30):
    """
    Download each url and return the text that Beautiful Soup extracts
    from the response.

    Parameters
    ----------
    urls: list
        List of URLs to scrape as strings
    timeout: float or tuple, optional
        Passed to ``requests.get`` as its ``timeout`` argument. Without a
        timeout a single unresponsive server would block the whole scrape
        indefinitely. Defaults to 30.

    Returns
    -------
    texts: dictionary
        Dictionary containing the url as keys and parsed text output as
        values. A url that appears more than once is downloaded each time
        and the text of the last download is kept.

    Examples
    --------
    >>> text_from_url(["https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html",
    ...                "https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html"])
    {'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html': '\\n\\n\\n\\n\\nFake Python\\n ... Senior Python Developer\\nPayne, Roberts and Davis\\n ...',
     'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html': '\\n\\n\\n\\n\\nFake Python\\n ... Energy engineer\\nVasquez-Davidson\\n ...'}
    """
    parse_res = {}
    for url in urls:
        page = requests.get(url, timeout=timeout)
        # Parse the raw bytes so Beautiful Soup performs the decoding itself.
        soup = BeautifulSoup(page.content, "html.parser")
        parse_res[url] = soup.text
    return parse_res
def duster(urls):
    """
    Scrape the raw text behind a list of urls into a pandas dataframe
    ready to be input into a machine learning model.

    Parameters
    ----------
    urls: list
        list of target urls as strings

    Returns
    ----------
    df: pandas dataframe
        A dataframe indexed by the webpage identifiers (index name ``id``)
        with two columns: ``url``, the raw url, and ``raw_text``, the text
        scraped from the webpage with every line break removed.

    Examples
    ----------
    >>> duster(['https://www.cnn.com/world',
    ...         'https://www.foxnews.com/world',
    ...         'https://www.cbc.ca/news/world'])
                                        url                                           raw_text
    id
    cnn1          https://www.cnn.com/world  World news - breaking news, video, headlines ...
    foxnews1  https://www.foxnews.com/world  World | Fox NewsFox News U.S.PoliticsMediaOp...
    cbc1      https://www.cbc.ca/news/world  World - CBC NewsContentSkip to Main ContentAcc...
    """
    # Scrape: mapping of url -> page text.
    scraped = text_from_url(urls)

    # One row per url; the urls start out as the index and are then moved
    # into a regular column named "url".
    frame = pd.DataFrame.from_dict(scraped, orient='index', columns=["raw_text"])
    frame = frame.reset_index()
    frame = frame.rename(columns={"index": "url"})

    # Drop the line breaks from the scraped text.
    frame['raw_text'] = frame['raw_text'].str.replace("\n", "")

    # Identifiers are derived from the url column and become the index.
    frame['id'] = create_id(frame['url'].tolist())
    return frame.set_index('id')
def bow(df):
    """
    Append a bag of words built from the last column of a data frame.

    A ``CountVectorizer`` with default settings is fitted on the last
    column of `df`. The resulting word counts are placed in one column per
    vocabulary term and concatenated to the right of all columns of `df`.

    Parameters
    ----------
    df : data frame
        a data frame with the last column of raw text

    Returns
    ----------
    df_bow : data frame
        a data frame which consists of every column of the input data
        frame (the text column included), followed by one count column
        per vocabulary term. It keeps the index of the input data frame.

    Examples
    ----------
    >>> df = pd.DataFrame({
    ...     "url_id": ["a1", "b1"],
    ...     "text": ["red fish", "blue fish fish"],
    ... })
    >>> result = bow(df)
    >>> result.columns.tolist()
    ['url_id', 'text', 'blue', 'fish', 'red']
    >>> int(result.loc[1, "fish"])
    2
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(df.iloc[:, -1]).toarray()

    # Give the count table the caller's index so the column-wise concat
    # below lines the rows up with `df`.
    bag = pd.DataFrame(
        data=counts,
        columns=vectorizer.get_feature_names_out(),
    ).set_index(df.index)

    return pd.concat([df, bag], axis=1)