In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

pd.__version__, np.__version__

import Quandl as quandl
import json
#KEY = '...'
#!mkdir ./.keys
#with open('./.keys/quandl-api-key.json','w') as f:
#    json.dump({'key': KEY}, f)
#with open('./.keys/quandl-api-key.json','r') as f:
#    quandl_token = json.load(f)['key']

import functools
_quandl_get = functools.partial(quandl.get, authtoken=quandl_token)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-8766c537cbe0> in <module>()
     16 
     17 import functools
---> 18 _quandl_get = functools.partial(quandl.get, authtoken=quandl_token)

NameError: name 'quandl_token' is not defined
In [8]:
# http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/USARGDPR-Real-GDP-in-the-United-States
# http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/GDP-Gross-Domestic-Product-1-Decimal
# http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/FYGFD-Gross-Federal-Debt
# http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/USAPOPL-Population-in-the-United-States
# http://www.quandl.com/FRED-Federal-Reserve-Economic-Data/CPIAUCSL-Consumer-Price-Index-for-All-Urban-Consumers-All-Items-USA-Inflation
import collections
_data = collections.OrderedDict()
for _key in ['FRED/USARGDPR', 'FRED/GDP', 'FRED/FYGFD', 'FRED/USAPOPL', 'FRED/CPIAUCSL']:
    _data[_key.replace('/','_')] = _quandl_get(_key)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-8c9edf886867> in <module>()
      7 _data = collections.OrderedDict()
      8 for _key in ['FRED/USARGDPR', 'FRED/GDP', 'FRED/FYGFD', 'FRED/USAPOPL', 'FRED/CPIAUCSL']:
----> 9     _data[_key.replace('/','_')] = _quandl_get(_key)

NameError: name '_quandl_get' is not defined
In [9]:
mpl.rcParams['figure.figsize'] = (20,4)
In [10]:
for k,v in _data.iteritems():
    v.plot(
        title=k,
        xlim=('1940','2020'),
        ylim=(0, v.max()),
        xticks=[str(x) for x in range(1949,2017,4)],
        x_compat=True)
#usargdpr.plot(), usagdp.plot(), fygfd.plot(), popl.plot()
#usargdpr.
In [11]:
_data['FRED_USARGDPR'].head(), _data['FRED_FYGFD'].head()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-11-56efb3f5a427> in <module>()
----> 1 _data['FRED_USARGDPR'].head(), _data['FRED_FYGFD'].head()

KeyError: 'FRED_USARGDPR'
In []:
((_data['FRED_USARGDPR'].resample('A', how='mean'))
 / _data['FRED_FYGFD'].resample('A', how='mean')).plot(title='USARGDPR / FYGFD')
((_data['FRED_GDP'].resample('A', how='mean'))
 / _data['FRED_FYGFD'].resample('A', how='mean')).plot(title='GDP / FYGFD')
In []:
((_data['FRED_GDP'].resample('A', how='mean'))
 / _data['FRED_USAPOPL'].resample('A', how='mean')).plot(title='FRED_GDP / FRED_USAPOPL')
In []:
((_data['FRED_FYGFD'].resample('A', how='mean'))
 / _data['FRED_GDP'].resample('A', how='mean')).plot(title='FRED_FYGFD / FRED_GDP')
In [12]:
plot = _data['FRED_CPIAUCSL'].resample('A', how='mean').plot(
    xticks=[str(x) for x in range(1949,2017,4)],
    x_compat=True,
    title="Yearly Inflation (CPI)"
    )
plot.legend(loc='upper left')


def add_line(plot, _year, text=None):
    _max = plot.yaxis.get_view_interval()[-1]
    plot.plot((_year,_year), (0, _max),
            color='gray', linewidth=1.5, linestyle="--")
    plot.annotate(
                text if text is not None else _year,
                xy=(_year, 0),
                xycoords='data',
                xytext=(+10, +30),
                textcoords='offset points',
                fontsize=12,
                #arrowprops=dict(arrowstyle="->"), #, connectionstyle=""), #arc3,rad=.2"),
                rotation='vertical',
                verticalalignment='bottom',
                horizontalalignment='center')

for year in range(1949, 2017, 4):
    add_line(plot, str(year))

display(plot)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-12-173a0b5b6889> in <module>()
----> 1 plot = _data['FRED_CPIAUCSL'].resample('A', how='mean').plot(
      2     xticks=[str(x) for x in range(1949,2017,4)],
      3     x_compat=True,
      4     title="Yearly Inflation (CPI)"
      5     )

KeyError: 'FRED_CPIAUCSL'
In [13]:
def get_presidents_df(data_file='./data/us_presidents.csv'):
    #!wget 'https://commondatastorage.googleapis.com/ckannet-storage/2012-05-08T122246/USPresident-Wikipedia-URLs-Thmbs-HS.csv' -O ./data/us_presidents.csv
    df = presidents = pd.read_csv(data_file)
    
    df = presidents[['President ','Took office ','Left office ']]
    df['Took office '] = pd.to_datetime(presidents['Took office '])
    df['Left office '] = pd.to_datetime(presidents['Left office '], coerce=True)
    #display(df)
    df = df.set_index('Took office ', drop=False, verify_integrity=True)
    df['term'] = df['Left office '] - df['Took office ']
    
    col = df['term']
    val = col[0]
    df['term'] = (
        col.apply(
            lambda x: x.astype('timedelta64[D]'),
            convert_dtype=False))
    col = df['term']
    val = col[0]
    val.item().days
    df['terms'] = df['term'].apply(
        lambda x: (x.item().days if x.item() else 0)
        / float(365.25*4))
    return df

#df['terms'] = (df['term'] / np.timedelta64(1, 'D')) / float(365.25*4) # pandas 0.13

df = get_presidents_df()
display(df.head())

def presidents_by_year(df=None):
    if df is None:
        df = get_presidents_df()
    for year,name in df.ix[:,['President ']].to_records():
        print(year.year, name)

def add_presidents(plot, presidents=None, yearmin=0):
    if presidents is None:
        presidents = get_presidents_df()

    for year,name in presidents.ix[str(yearmin):,['President ']].to_records():
        #print year.year, name
        add_line(plot, year, name)

def poli_plot(df, **kwargs):
    yearmin = df.index.min().year
    yearmax = 2017
    
    plot = df.plot(
        xticks=[str(x) for x in range(yearmax, yearmin,-4)],
        x_compat=True,
        **kwargs)
    plot.legend(loc='upper left')
    
    add_presidents(plot, yearmin=yearmin)
    return plot
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-13-ea7dd0b78148> in <module>()
     27 #df['terms'] = (df['term'] / np.timedelta64(1, 'D')) / float(365.25*4) # pandas 0.13
     28 
---> 29 df = get_presidents_df()
     30 display(df.head())
     31 

<ipython-input-13-ea7dd0b78148> in get_presidents_df(data_file)
     19     col = df['term']
     20     val = col[0]
---> 21     val.item().days
     22     df['terms'] = df['term'].apply(
     23         lambda x: (x.item().days if x.item() else 0)

AttributeError: 'long' object has no attribute 'days'
In []:
df = _data['FRED_CPIAUCSL']
poli_plot(df)
In []:
df = ((_data['FRED_GDP'].resample('A', how='mean')) / _data['FRED_USAPOPL'].resample('A', how='mean'))
poli_plot(df, title="GDP per capita (thousands of dollars)")
In []:
df = ((_data['FRED_FYGFD'].resample('A', how='mean')) / _data['FRED_USAPOPL'].resample('A', how='mean'))
poli_plot(df, title="Federal debt per capita (thousands of dollars)")
In []:
inflation_factor_linear = (_data['FRED_CPIAUCSL'] / _data['FRED_CPIAUCSL'].max()).resample('A', how='mean')
inflation_factor_uhh = 1 / inflation_factor_linear
cpi = inflation_factor_uhh
#display( cpi.head()) 
#display( cpi.tail())


df = ((_data['FRED_FYGFD'].resample('A', how='mean')) / _data['FRED_USAPOPL'].resample('A', how='mean'))
#print( df.columns )

scaled = (df * cpi)
#display(scaled.tail())
display(poli_plot(_data['FRED_CPIAUCSL'], title="inflation (FRED CPI UCSL)"))
plot = poli_plot(_data['FRED_CPIAUCSL'].resample('A','mean').pct_change(), title='yearly % change in inflation')
plot.axhline()

display(plot)


display(poli_plot(df, title="debt-per-capita"))
display(poli_plot(scaled, title="debt-per-capita scaled for inflation"))
plot = poli_plot(scaled.pct_change(), title="Yearly % change in debt-per-capita scaled for inflation")
plot.axhline()
display(plot)


#poli_plot(df)
#poli_plot(df * inflation_factor_linear)
In [376]:
 
Out[376]:
True
In [390]:
# TODO: add house/senate majority party
# TODO: add major wars
In [285]:
#!wget https://github.com/unitedstates/congress-legislators/raw/master/legislators-historical.yaml -O ./data/legislators-historical.yaml
#!wget https://github.com/unitedstates/congress-legislators/raw/master/legislators-current.yaml -O ./data/legislators-current.yaml
#import yaml
#data = None
#with open('./data/legislators-historical.yaml','rb') as f:
#    data = yaml.load(f)
!ls ./data
legislators-current.yaml     sample_portfolio.csv
legislators-historical.yaml  us_presidents.csv

In [286]:
import yaml
def iter_members(
    data_files=['./data/legislators-historical.yaml',
                './data/legislators-current.yaml']):
    
    for data_file in data_files:
        data = None
        with open(data_file,'rb') as f:
            data = yaml.load(f)
        for m in data:
            for t in m['terms']:
                yield (
                    t['state'],
                    t['type'],
                    t['start'],
                    t['end'],
                    t.get('party'),
                    m['name']['first'],
                    m['name']['last'],
                    m.get('bio',{}).get('gender', 'M'), # ...
                    m.get('bio',{}).get('birthday')
                )
iter_members.columns = [
    'state',
    'type',
    'start',
    'end',
    'party',
    'first',
    'last',
    'gender',
    'birthday'
]

_legislator_data = list(iter_members())
In [287]:
df = pd.DataFrame.from_records(
    _legislator_data,
    columns=iter_members.columns)
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'])
df['birthday'] = pd.to_datetime(df['birthday'])
df.set_index('start', drop=False, inplace=True)
display(df.head())
display(df)
state type start end party first last gender birthday
start
1789-03-04 DE sen 1789-03-04 00:00:00 1793-03-02 00:00:00 Anti-Administration Richard Bassett M 1745-04-02 00:00:00
1789-03-04 VA rep 1789-03-04 00:00:00 1791-03-03 00:00:00 None Theodorick Bland M 1742-03-21 00:00:00
1789-03-04 SC rep 1789-03-04 00:00:00 1791-03-03 00:00:00 None Aedanus Burke M 1743-06-16 00:00:00
1789-03-04 MD rep 1789-03-04 00:00:00 1791-03-03 00:00:00 None Daniel Carroll M 1730-07-22 00:00:00
1789-03-04 PA rep 1789-03-04 00:00:00 1791-03-03 00:00:00 None George Clymer M 1739-03-16 00:00:00
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 42493 entries, 1789-03-04 00:00:00 to 2013-06-10 00:00:00
Data columns (total 9 columns):
state       42493  non-null values
type        42493  non-null values
start       42493  non-null values
end         42493  non-null values
party       42032  non-null values
first       42493  non-null values
last        42493  non-null values
gender      42493  non-null values
birthday    41240  non-null values
dtypes: datetime64[ns](3), object(6)
In [288]:
col = df['party']
uniques = dict.fromkeys(col.unique())
#print(uniques)

_party_map = {}
repub, democ, other = [], [], []
for x in uniques:
    if x is not None:
        if 'Republ' in x:
            repub.append(x)
            _party_map[x] = 'Republican'
        elif 'Democr' in x:
            democ.append(x)
            _party_map[x] = 'Democrat'
        else:
            other.append(x)
            _party_map[x] = 'Other' # ...
    else:
        other.append(x)
        _party_map[x] = 'Other' # ...
        

print(len(repub), repub)
print(len(democ), democ)
print(len(other), other)
(15, ['Progressive Republican', 'Crawford Republican', 'Democratic Republican', 'Jackson Republican', 'Conservative Republican', 'Ind. Republican', 'Jacksonian Republican', 'Democrat-turned-Republican', 'Liberal Republican', 'Ind. Republican-Democrat', 'Democrat/Republican', 'Silver Republican', 'Republican', 'Independent/Republican', 'Republican-Conservative'])
(12, ['Adams Democrat', 'Ind. Democrat', 'Readjuster Democrat', 'Democrat/Independent', 'Democrat-Liberal', 'Independent Democrat', 'Democrat', 'Popular Democrat', 'Anti-Lecompton Democrat', 'Democratic and Union Labor', 'Democrat Farmer Labor', 'Union Democrat'])
(45, ['States Rights', 'Jackson', 'New Progressive', 'Union', 'Unknown', 'Nullifier', 'Liberty', 'Conservative', 'Constitutional Unionist', 'Law and Order', 'Progressive', 'Independent', 'Readjuster', 'American Labor', 'Jacksonian', 'Anti Mason', 'Union Labor', 'Unionist', 'Free Soil', 'Nonpartisan', 'Silver', 'Anti Jackson', 'Federalist', 'Anti-administration', 'Adams', 'Pro-Administration', 'Populist', 'AL', 'Liberal', 'American', 'National Greenbacker', 'Farmer-Labor', 'Prohibitionist', 'Anti-Administration', 'Pro-administration', 'Whig', 'Anti-Jacksonian', 'Coalitionist', 'Anti Jacksonian', 'Socialist', None, 'Anti Masonic', 'Unconditional Unionist', 'Ind. Whig', 'Free Silver'])

In [289]:
display(df['1949':]['party'].value_counts())
display(df[df['party']=='Liberal'])
Democrat                      8957
Republican                    6946
Independent                     22
Popular Democrat                13
New Progressive                  6
Democrat Farmer Labor            5
Democrat/Republican              2
Ind. Democrat                    2
Democrat-Liberal                 2
Democrat-turned-Republican       2
Liberal                          1
AL                               1
Democrat/Independent             1
American Labor                   1
Independent/Republican           1
Conservative                     1
Republican-Conservative          1
dtype: int64
state type start end party first last gender birthday
start
1949-01-03 NY rep 1949-01-03 00:00:00 1951-01-02 00:00:00 Liberal Franklin Roosevelt M 1914-08-17 00:00:00
In [186]:
# print(df2[df2['state'] == 'NE'][['type', 'party','first','last']].sort().to_string())
In [290]:
print('# party')
print(df[df['state'] == 'NE']['party'].value_counts(normalize=True))
print('# type')
print(df[df['state'] == 'NE']['type'].value_counts())
# party
Republican            0.719346
Democrat              0.226158
Populist              0.046322
Independent           0.002725
Liberal Republican    0.002725
dtype: float64
# type
rep    305
sen     62
dtype: int64

In [291]:
df['two_party_fail'] = df['party'].apply(lambda x: _party_map.get(x))
display( df['two_party_fail'].value_counts() )
display( df['two_party_fail'].value_counts(normalize=True) )
Democrat      19805
Republican    18294
Other          4394
dtype: int64
Democrat      0.466077
Republican    0.430518
Other         0.103405
dtype: float64
In [292]:
# objective: draw chart with per-year, per-two-party-counts
# group by year
# count factors

def start_year(x):
    return x.year - (x.year % 2)

grouper = df.groupby([start_year, 'two_party_fail'])

whoa = grouper.aggregate({'two_party_fail':len}).unstack()

display(whoa.plot())
display(whoa.head())
display(whoa.tail())

#df.pivot_table(values='two_party_fail', cols=['start'], aggfunc=len)
<matplotlib.axes.AxesSubplot at 0x9eacb910>
two_party_fail
two_party_fail Democrat Other Republican
1788 NaN 77 13
1790 NaN 68 17
1792 NaN 84 40
1794 NaN 64 65
1796 NaN 80 61
two_party_fail
two_party_fail Democrat Other Republican
2004 225 1 253
2006 263 2 219
2008 298 NaN 194
2010 215 NaN 275
2012 238 2 246
In [294]:
grouper = df.groupby([start_year, 'gender'])

whoa = grouper.aggregate({'gender':len}).unstack()
display(whoa.plot())
<matplotlib.axes.AxesSubplot at 0x4354e90>
In [306]:
# Q. how are these misleading / maybe not as helpful as they could be?
# 1. they count by start year, so they don't show the state at any given time
#    to show the state at any given time would require
#    a 'currently_serving' function
#    which, one might think could take into account standard terms/elections
#    as appropriate for rep/sen,
#    but there are special cases in mid-stream

# 2. they do not stratify by rep/sen; the counts are lumped together
#    'share_y' split by 'type' might be helpful

# ... how many hours would it take to draw these in [spreadsheet tool]
#     only to realize that you have no idea what 
#     'settings' were used to create a (very beautiful) chart?
#     ... python tools for visual studio now support 
#         something like `ipython --pylab=inline/qt`
#         ... i work on various platforms, so that's not an option for me
#         ... not sure what sort of configuration is required to get
#             anaconda ce working with this ide
#     ... ipython qt, ipython notebook
#     ... spyder ide
#     ... you can run these as scheduled jobs which generate online charts,
#         but then, still, without the source,
#         what smoke are you
# ... "you can get a good look at a t-bone steak by"