# Source code for pybrokk.pybrokk

from urllib.parse import urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

def _site_name(url):
    """Return the host label of *url* that is used as the website name."""
    parts = urlsplit(url)
    if not parts.netloc:
        # urlsplit only recognises the host when the URL has a scheme or
        # starts with "//"; retry so that "www.reddit.com/r/nba" still works.
        parts = urlsplit("//" + url)
    host = parts.hostname or ""
    labels = [label for label in host.split(".") if label]
    if not labels:
        raise ValueError(f"cannot determine a website name from url {url!r}")
    # "www.reddit.com" -> "reddit"; hosts with fewer than three labels
    # ("reddit.com", "localhost") have no sub-domain to skip.
    return labels[1] if len(labels) >= 3 else labels[0]


def create_id(urls):
    """
    Convert a list of urls into a list of unique identifiers for use in downstream functions.

    Each identifier is the website name taken from the url's host followed by
    a running count of how many times that website has appeared so far in
    ``urls``. For hosts with three or more dot-separated labels the second
    label is used (``www.reddit.com`` -> ``reddit``); otherwise the first
    label is used (``reddit.com`` -> ``reddit``). Only the host is inspected,
    so dots in the path or query string do not affect the result. The host is
    lower-cased by the url parser.

    Parameters
    ----------
    urls : list
        A list of urls as strings

    Returns
    -------
    ids : list
        A list of unique identifiers as strings, in the same order as ``urls``

    Raises
    ------
    ValueError
        If no host can be found in one of the urls.

    Examples
    --------
    >>> from pybrokk.pybrokk import create_id
    >>> create_id(['https://www.reddit.com/r/nba/', 'https://www.reddit.com/r/nfl/', 'https://vancouver.craigslist.org/search/apa', 'https://www.kijiji.ca/b-real-estate/richmond-bc/c34l1700288'])
    ['reddit1', 'reddit2', 'craigslist1', 'kijiji1']
    """
    ids = []
    seen_counts = {}
    for url in urls:
        website_name = _site_name(url)
        seen_counts[website_name] = seen_counts.get(website_name, 0) + 1
        ids.append(f"{website_name}{seen_counts[website_name]}")
    return ids

def text_from_url(urls, timeout=30):
    """
    Fetch each url and return the text of its page as parsed by Beautiful Soup.

    The HTTP status of a response is not checked, so the text of an error
    page is returned like any other page. Exceptions raised by ``requests``
    (connection failures, timeouts) propagate to the caller.

    Parameters
    ----------
    urls : list
        List of URLs to scrape as strings
    timeout : float or tuple, optional
        Passed to ``requests.get`` as its ``timeout`` argument for every
        request (default 30). Pass ``None`` to wait without limit.

    Returns
    -------
    texts : dictionary
        Dictionary containing the url as keys and parsed text output as
        values. A url that appears more than once in ``urls`` is fetched each
        time but has a single entry.

    Examples
    --------
    >>> text_from_url(["https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html", "https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html"])
    {'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html': '...Senior Python Developer...',
     'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html': '...Energy engineer...'}
    """
    parse_res = {}

    for url in urls:
        # Without a timeout a single unresponsive server blocks the whole run.
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.content, "html.parser")
        parse_res[url] = soup.get_text()
    return parse_res

def duster(urls):
    """
    Scrape raw text from a list of urls into a pandas dataframe ready to be input into a machine learning model.

    Parameters
    ----------
    urls : list
        list of target urls as strings

    Returns
    -------
    df : pandas dataframe
        A dataframe indexed by the webpage identifiers from ``create_id``
        (index name ``id``) with two columns: ``url``, the raw url, and
        ``raw_text``, the text returned by ``text_from_url`` with every line
        break removed. Line breaks are deleted rather than replaced by a
        space, so text on adjacent lines is joined together.

    Examples
    --------
    >>> from pybrokk.pybrokk import duster
    >>> duster(['https://www.cnn.com/world', 'https://www.foxnews.com/world', 'https://www.cbc.ca/news/world'])
                                        url                                           raw_text
    id
    cnn1          https://www.cnn.com/world   World news - breaking news, video, headlines ...
    foxnews1  https://www.foxnews.com/world  World | Fox NewsFox News   U.S.PoliticsMediaOp...
    cbc1      https://www.cbc.ca/news/world  World - CBC NewsContentSkip to Main ContentAcc...
    """
    # scrape text from the web: {url: page text}
    output = text_from_url(urls)

    # one row per scraped url, with the dictionary keys moved into a "url" column
    df = pd.DataFrame.from_dict(output, orient='index', columns=["raw_text"])
    df = df.reset_index().rename(columns={"index": "url"})

    # remove line breaks; regex=False keeps this a literal replacement
    # regardless of the pandas version's default
    df['raw_text'] = df['raw_text'].str.replace("\n", "", regex=False)

    # add id as index
    df['id'] = create_id(df['url'].tolist())
    return df.set_index('id')

def bow(df):
    """
    Append a bag-of-words representation of the last column of a data frame.

    The last column of ``df`` is fitted with a ``CountVectorizer`` using its
    default settings, and one count column per token in the learned
    vocabulary is appended to the right of the original columns.

    Parameters
    ----------
    df : data frame
        a data frame whose last column holds the raw text

    Returns
    -------
    df_bow : data frame
        a new data frame with all columns of ``df`` (including the text
        column) followed by one column of counts per vocabulary token. The
        index of ``df`` is preserved. Count columns are named after the
        tokens themselves, so a token that equals an existing column name
        (for example ``text``) results in two columns with the same label.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     "url_id": ["a1", "b1"],
    ...     "text": ["red apple", "green apple"],
    ... })
    >>> bow(df)
       url_id         text  apple  green  red
    0      a1    red apple      1      0    1
    1      b1  green apple      1      1    0
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(df.iloc[:, -1])
    # Reuse df's index so the concat below lines the rows up one-to-one.
    df_counts = pd.DataFrame(
        counts.toarray(),
        index=df.index,
        columns=vectorizer.get_feature_names_out(),
    )
    return pd.concat([df, df_counts], axis=1)