In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import ast
In [2]:
main_link = 'https://leg.colorado.gov/bills/'
In [3]:
with open('legislation_2018.txt', 'r') as y2018:
    y2018_r = ast.literal_eval(y2018.read())
In [4]:
print(len(y2018_r))
for i in y2018_r[1:2]:
    print(f'{main_link}{i}')
    print(f'{i}')
    
784
https://leg.colorado.gov/bills/HB18-1002
HB18-1002
In [5]:
data = {
    'bill_number':[],
    'status_1':[],
    'Last_action':[],
    'governor_action':[],
    'prime_sponsor':[],
    'sponsor':[],
    'Co_sponsor':[],
    'Title':[],
    'Subtitle':[],
    'Session':[],
    'Subjects':[],
    'committee':[],
    'Senate_vote_date':[],
    'Senate_vote_action':[],
    'Senate_vote_motion':[],
    'Senate_vote_count':[],
    'Senate_vote_link':[],
    'House_vote_date':[],
    'House_vote_action':[],
    'House_vote_motion':[],
    'House_vote_count':[],
    'House_vote_link':[]
}

#data
In [6]:
def get_data(data):
    df = pd.DataFrame(data, columns=['bill_number','status_1','status_2','prime_sponsor','sponsor','Co_sponsor',
                                    'Title', 'Subtitle','Session','Subjects', 'committee', 'Senate_vote_date',
                                     'Senate_vote_action', 'Senate_vote_motion', 'Senate_vote_count', 'Senate_vote_link',
                                     'House_vote_date', 'House_vote_action', 'House_vote_motion', 'House_vote_count',
                                     'House_vote_link'
                                    ])
    return df
In [7]:
final_dict = {}
def get_status_attributes(soup2, bill,at,bill_history2,sponsers_all, bill_info, committee2, votes_exist2):  
    motion_types = ['CNCRNT RES', 'RES', 'RESOLUTION', 'MEM', 'MEMORIAL', 'Bill']
    motion_types = [x.lower() for x in motion_types]
    action_types = ['Third Reading', 'RESOLUTIONS','Consideration of Resolutions', 'MEMORIALS',  'Consideration of Memorials']
    action_types = [y.lower() for y in action_types]
    
    stat = []
    prime_spons = []
    spons = []
    Co_spons = []
    get_subject=[]
    committee3=[]
    
    for i in at:
        stat.append(i.find('div', class_="field-item even").string)
    for bill_history3 in bill_history2:
        if len(bill_history3.find_all('td')) != 0:           
            for last_action in bill_history3.find('td', attrs={"data-label":"Action"}):
                data['Last_action'].append(last_action)
            all_action = []
            gov_action = []
            for actions in bill_history3.find_all('td', attrs={"data-label":"Action"}):
                all_action.append(actions.string.lower())
            for actions1 in all_action:
                if 'governor' in actions1:
                    gov_action.append(actions1)
            data['governor_action'].append(';'.join(gov_action)) if len(gov_action) != 0 else (data['governor_action'].append(np.NaN))

    for i in sponsers_all:
        if i.td.string=='Prime Sponsor':
            for ps in i.find_all('a'):
                prime_spons.append(ps.string)
        if i.td.string=='Sponsor':
            for s in i.find_all('a'):
                spons.append(s.string)
        if i.td.string=='Co-sponsor':
            for cs in i.find_all('a'):
                Co_spons.append(cs.string)

    for i in bill_info:
        bill_title = i.find('h1', class_='node__title node-title')
        for j in i.find_all('div', 'field field-name-field-bill-long-title field-type-text-long field-label-hidden'):
            bill_subtitle = j.find('div', 'field-item even')
        for k in i.find_all('div', 'bill-session'):
            bill_session = k.find('div', 'field-item even')
            
        for j in i.find_all('div', class_='bill-subjects'):
            subjects = j.find_all('div', class_='field-items')
            for k in subjects:
                for l in k.find_all('div'):
                    get_subject.append(l.string)
            
    if len(committee2) != 0:
        for i in soup2.find_all('div', class_='committee-item'):
            committee3.append(i.text)
        data['committee'].append(committee3[0].strip('\n\n'))
    if len(committee2) == 0:
        data['committee'].append(np.NaN)

    data['bill_number'].append(bill)
    data['status_1'].append(stat[-1])
    data['prime_sponsor'].append(prime_spons)
    data['sponsor'].append(spons)
    data['Co_sponsor'].append(Co_spons)
    data['Title'].append(bill_title.string)
    data['Subtitle'].append(bill_subtitle.string)
    data['Session'].append(bill_session.string)
    data['Subjects'].append(get_subject)
    
    if len(votes_exist2) != 0:
        if len(soup2.find('div', id='bill-documents-tabs4').find_all('li', class_='accordion-item')) == 2:
            for i in soup2.find('div', id='bill-documents-tabs4').find_all('li', class_='accordion-item'):
                votes_main_link = 'https://leg.colorado.gov'
                for j in i.find_all('h5'):
                    if 'House' in j.text:
                        House_vote_date1 = []
                        House_vote_action1 = []
                        House_vote_motion1 = []
                        House_vote_count1 = []
                        House_vote_link1 = []
                        for house_b_info in i.find_all('tr'):
                            if len(house_b_info.find_all('div', class_='field field-name-field-vote-action field-type-text field-label-hidden')) != 0 and len(house_b_info.find_all('div',
                                                                                                                                                          class_=('field field-name-field-vote-motion field-type-text field-label-hidden'))) != 0:

                                #print(house_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden').find('div', class_="field-item even").text)
                                if (house_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden'). find('div', class_="field-item even").text.lower()) in action_types and (house_b_info.find('div', class_='field field-name-field-vote-motion field-type-text field-label-hidden').find('div', class_="field-item even").text.lower()) in motion_types:
                                    #print('Yes it is in')
                                    House_vote_date1.append(house_b_info.find('div', class_='field field-name-field-date field-type-datetime field-label-hidden').text.strip())
                                    House_vote_action1.append(house_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden').text.strip())
                                    House_vote_motion1.append(house_b_info.find('div', class_='field field-name-field-vote-motion field-type-text field-label-hidden').text.strip())
                                    House_vote_count1.append(house_b_info.find('div', class_='field field-name-field-vote-text field-type-text field-label-hidden').text.strip())
                                    House_vote_link1.append(f"{votes_main_link}{house_b_info.find('a').get('href')}")

                        data['House_vote_date'].append(";".join(House_vote_date1)) if len(House_vote_date1) > 1 else (data['House_vote_date'].append(House_vote_date1[0]))
                        data['House_vote_action'].append(";".join(House_vote_action1)) if len(House_vote_action1) > 1 else (data['House_vote_action'].append(House_vote_action1[0]))
                        data['House_vote_motion'].append(";".join(House_vote_motion1)) if len(House_vote_motion1) > 1 else (data['House_vote_motion'].append(House_vote_motion1[0]))  
                        data['House_vote_count'].append(";".join(House_vote_count1)) if len(House_vote_count1) > 1 else (data['House_vote_count'].append(House_vote_count1[0]))                     
                        data['House_vote_link'].append(";".join(House_vote_link1)) if len(House_vote_link1) > 1 else (data['House_vote_link'].append(House_vote_link1[0]))
                    if 'Senate' in j.text:
                        Senate_vote_date1 = []
                        Senate_vote_action1 = []
                        Senate_vote_motion1 = []
                        Senate_vote_count1 = []
                        Senate_vote_link1 = []
                        for senate_b_info in i.find_all('tr'):
                            if len(senate_b_info.find_all('div', class_='field field-name-field-vote-action field-type-text field-label-hidden')) != 0 and len(senate_b_info.find_all('div',
                                                                                                                                                          class_=('field field-name-field-vote-motion field-type-text field-label-hidden'))) != 0:
                                #print(senate_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden').find('div', class_="field-item even").text)
                                if (senate_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden'). find('div', class_="field-item even").text.lower()) in action_types and (senate_b_info.find('div', class_='field field-name-field-vote-motion field-type-text field-label-hidden').find('div', class_="field-item even").text.lower()) in motion_types:
                                    Senate_vote_date1.append(senate_b_info.find('div', class_='field field-name-field-date field-type-datetime field-label-hidden').text.strip())
                                    Senate_vote_action1.append(senate_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden').text.strip())
                                    Senate_vote_motion1.append(senate_b_info.find('div', class_='field field-name-field-vote-motion field-type-text field-label-hidden').text.strip())
                                    Senate_vote_count1.append(senate_b_info.find('div', class_='field field-name-field-vote-text field-type-text field-label-hidden').text.strip())
                                    Senate_vote_link1.append(f"{votes_main_link}{senate_b_info.find('a').get('href')}")
                        data['Senate_vote_date'].append(";".join(Senate_vote_date1)) if len(Senate_vote_date1) > 1 else (data['Senate_vote_date'].append(Senate_vote_date1[0]))
                        data['Senate_vote_action'].append(";".join(Senate_vote_action1)) if len(Senate_vote_action1) > 1 else (data['Senate_vote_action'].append(Senate_vote_action1[0]))
                        data['Senate_vote_motion'].append(";".join(Senate_vote_motion1)) if len(Senate_vote_motion1) > 1 else (data['Senate_vote_motion'].append(Senate_vote_motion1[0]))  
                        data['Senate_vote_count'].append(";".join(Senate_vote_count1)) if len(Senate_vote_count1) > 1 else (data['Senate_vote_count'].append(Senate_vote_count1[0])) 
                        data['Senate_vote_link'].append(";".join(Senate_vote_link1)) if len(Senate_vote_link1) > 1 else (data['Senate_vote_link'].append(Senate_vote_link1[0])) 
        
        if len(soup2.find('div', id='bill-documents-tabs4').find_all('li', class_='accordion-item')) == 1:
            print('length is 1')
            for i in soup2.find('div', id='bill-documents-tabs4').find_all('li', class_='accordion-item'):
                votes_main_link = 'https://leg.colorado.gov'
                for j in i.find_all('h5'):
                    if 'House' in j.text:
                        #print(j.text)
                        House_vote_date1 = []
                        House_vote_action1 = []
                        House_vote_motion1 = []
                        House_vote_count1 = []
                        House_vote_link1 = []
                        for house_b_info in i.find_all('tr'):
                            if len(house_b_info.find_all('div', class_='field field-name-field-vote-action field-type-text field-label-hidden')) != 0 and len(house_b_info.find_all('div',
                                                                                                                                                          class_=('field field-name-field-vote-motion field-type-text field-label-hidden'))) != 0:

                                #print(house_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden').find('div', class_="field-item even").text)
                                if (house_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden'). find('div', class_="field-item even").text.lower()) in action_types and (house_b_info.find('div', class_='field field-name-field-vote-motion field-type-text field-label-hidden').find('div', class_="field-item even").text.lower()) in motion_types:
                                    House_vote_date1.append(house_b_info.find('div', class_='field field-name-field-date field-type-datetime field-label-hidden').text.strip())
                                    House_vote_action1.append(house_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden').text.strip())
                                    House_vote_motion1.append(house_b_info.find('div', class_='field field-name-field-vote-motion field-type-text field-label-hidden').text.strip())
                                    House_vote_count1.append(house_b_info.find('div', class_='field field-name-field-vote-text field-type-text field-label-hidden').text.strip())
                                    House_vote_link1.append(f"{votes_main_link}{house_b_info.find('a').get('href')}")

                        data['House_vote_date'].append(";".join(House_vote_date1)) if len(House_vote_date1) > 1 else (data['House_vote_date'].append(House_vote_date1[0]))
                        data['House_vote_action'].append(";".join(House_vote_action1)) if len(House_vote_action1) > 1 else (data['House_vote_action'].append(House_vote_action1[0]))
                        data['House_vote_motion'].append(";".join(House_vote_motion1)) if len(House_vote_motion1) > 1 else (data['House_vote_motion'].append(House_vote_motion1[0]))  
                        data['House_vote_count'].append(";".join(House_vote_count1)) if len(House_vote_count1) > 1 else (data['House_vote_count'].append(House_vote_count1[0]))                     
                        data['House_vote_link'].append(";".join(House_vote_link1)) if len(House_vote_link1) > 1 else (data['House_vote_link'].append(House_vote_link1[0]))

                    if 'House' not in j.text:
                            data['House_vote_date'].append(np.NaN)
                            data['House_vote_action'].append(np.NaN)
                            data['House_vote_motion'].append(np.NaN)
                            data['House_vote_count'].append(np.NaN)
                            data['House_vote_link'].append(np.NaN)
                    if 'Senate' in j.text:
                        #print(j.text)
                        Senate_vote_date1 = []
                        Senate_vote_action1 = []
                        Senate_vote_motion1 = []
                        Senate_vote_count1 = []
                        Senate_vote_link1 = []
                        for senate_b_info in i.find_all('tr'):
                            if len(senate_b_info.find_all('div', class_='field field-name-field-vote-action field-type-text field-label-hidden')) != 0 and len(senate_b_info.find_all('div',
                                                                                                                                                          class_=('field field-name-field-vote-motion field-type-text field-label-hidden'))) != 0:
                                #print(senate_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden').find('div', class_="field-item even").text)
                                if (senate_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden'). find('div', class_="field-item even").text.lower()) in action_types and (senate_b_info.find('div', class_='field field-name-field-vote-motion field-type-text field-label-hidden').find('div', class_="field-item even").text.lower()) in motion_types:
                                    Senate_vote_date1.append(senate_b_info.find('div', class_='field field-name-field-date field-type-datetime field-label-hidden').text.strip())
                                    Senate_vote_action1.append(senate_b_info.find('div', class_='field field-name-field-vote-action field-type-text field-label-hidden').text.strip())
                                    Senate_vote_motion1.append(senate_b_info.find('div', class_='field field-name-field-vote-motion field-type-text field-label-hidden').text.strip())
                                    Senate_vote_count1.append(senate_b_info.find('div', class_='field field-name-field-vote-text field-type-text field-label-hidden').text.strip())
                                    Senate_vote_link1.append(f"{votes_main_link}{senate_b_info.find('a').get('href')}")
                        data['Senate_vote_date'].append(";".join(Senate_vote_date1)) if len(Senate_vote_date1) > 1 else (data['Senate_vote_date'].append(Senate_vote_date1[0]))
                        data['Senate_vote_action'].append(";".join(Senate_vote_action1)) if len(Senate_vote_action1) > 1 else (data['Senate_vote_action'].append(Senate_vote_action1[0]))
                        data['Senate_vote_motion'].append(";".join(Senate_vote_motion1)) if len(Senate_vote_motion1) > 1 else (data['Senate_vote_motion'].append(Senate_vote_motion1[0]))  
                        data['Senate_vote_count'].append(";".join(Senate_vote_count1)) if len(Senate_vote_count1) > 1 else (data['Senate_vote_count'].append(Senate_vote_count1[0])) 
                        data['Senate_vote_link'].append(";".join(Senate_vote_link1)) if len(Senate_vote_link1) > 1 else (data['Senate_vote_link'].append(Senate_vote_link1[0])) 

                    if 'Senate' not in j.text:
                            data['Senate_vote_date'].append(np.NaN)
                            data['Senate_vote_action'].append(np.NaN)
                            data['Senate_vote_motion'].append(np.NaN)
                            data['Senate_vote_count'].append(np.NaN)
                            data['Senate_vote_link'].append(np.NaN)
    else:
        data['House_vote_date'].append(np.NaN)
        data['House_vote_action'].append(np.NaN)
        data['House_vote_motion'].append(np.NaN)
        data['House_vote_count'].append(np.NaN)
        data['House_vote_link'].append(np.NaN) 
        data['Senate_vote_date'].append(np.NaN)
        data['Senate_vote_action'].append(np.NaN)
        data['Senate_vote_motion'].append(np.NaN)
        data['Senate_vote_count'].append(np.NaN)
        data['Senate_vote_link'].append(np.NaN)
In [8]:
def main(link, bill):
    get_page = requests.get(link)
    soup = BeautifulSoup(get_page.content, 'lxml')
    
    get_status = soup.find_all('div', class_='field field-name-field-label field-type-text field-label-hidden')
    bill_history1 = soup.find('div', id='bill-documents-tabs7').find_all('tbody')
    sponsers_all = soup.find('div', id='bill-documents-tabs8').find('tbody').find_all('tr')
    bill_infos = soup.find_all(id='main-content', class_='main-content')
    committee1 = soup.find_all('div', 'committee-item')
    votes_exist1 = soup.find_all('div', id='bill-documents-tabs4')
        
    get_status_attributes(soup, bill,get_status, bill_history1, sponsers_all, bill_infos, committee1, votes_exist1)  

    df = get_data(data)
    
    return df
In [9]:
if __name__ == '__main__':
    for i in y2018_r[443:444]:
        print(f'{main_link}{i}')
        final_df = main(f'{main_link}{i}',i)
https://leg.colorado.gov/bills/HJR18-1001
In [10]:
#data
In [11]:
final_df
Out[11]:
bill_number status_1 status_2 prime_sponsor sponsor Co_sponsor Title Subtitle Session Subjects ... Senate_vote_date Senate_vote_action Senate_vote_motion Senate_vote_count Senate_vote_link House_vote_date House_vote_action House_vote_motion House_vote_count House_vote_link
0 HJR18-1001 Adopted NaN [Rep. K. Becker, Sen. C. Holbert] [Rep. C. Duran, Rep. P. Neville, Sen. L. Guzman] [Sen. J. Kefalas] Message From Governor Concerning a Joint Session of the House of Rep... 2018 Regular Session [State Government] ... 01/10/2018 Consideration of Resolutions RESOLUTION Aye: 35 No: 0 Other: 0 https://leg.colorado.gov/content/hjr18-1001vot... 01/10/2018 RESOLUTIONS RES Aye: 64 No: 0 Other: 0 https://leg.colorado.gov/content/hjr18-1001vot...

1 rows × 21 columns

In [12]:
#final_df.to_excel('Data Files/check1.xlsx')
In [13]:
!jupyter nbconvert Bill_Info_CO.ipynb --to html
[NbConvertApp] Converting notebook Bill_Info_CO.ipynb to html
[NbConvertApp] Writing 366988 bytes to Bill_Info_CO.html
In [ ]: