pandas Aug 2020
Data updated monthly
Bike Stats Code¶
Code to support the analysis in the notebook Bike-Stats.ipynb.
from IPython.core.display import HTML
from typing import Iterator, Iterable, Tuple, List, Dict
from collections import namedtuple
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
Reading Data: rides, yearly, and daily¶
I saved a bunch of my recorded Strava rides, most of them longer than 25 miles, as bikerides.tsv. The tab-separated columns are: the date; the year; a title; the elapsed time of the ride; the length of the ride in miles; and the total climbing in feet, e.g.:
Mon, 10/5/2020 Half way around the bay on bay trail 6:26:35 80.05 541
I parse the file into the pandas dataframe rides, adding derived columns for miles per hour, vertical meters climbed per hour (VAM), grade in feet per mile, grade in percent, and kilometers ridden:
def parse_rides(lines):
    """Read a tab-separated rides file into a DataFrame with derived columns.

    `lines` is passed straight to `pd.read_table`, with '#' as the comment
    marker.  The 'hours' column is converted with `parse_hours` and the
    'feet' column with `parse_int`; the derived ride columns are then added
    and the index labels are blanked.
    """
    converters = {'hours': parse_hours, 'feet': parse_int}
    table = pd.read_table(lines, comment='#', converters=converters)
    with_derived = add_ride_columns(table)
    return drop_index(with_derived)
def parse_hours(time: str) -> float:
    """Convert a colon-separated clock string to hours, e.g. '4:30:00' => 4.5.

    Fields are read right to left as seconds, minutes, hours, so a
    two-field string such as '28:49' is treated as minutes:seconds.
    The result is rounded to two decimal places.
    """
    fields = time.split(':')
    total = 0
    # Position 0 is seconds (60 ** -2 hours), 1 is minutes, 2 is hours.
    for position, field in enumerate(fields[::-1]):
        total += int(field) * 60 ** (position - 2)
    return round(total, 2)
def parse_int(field: str) -> int:
    """Parse an integer, ignoring thousands commas and 'ft' / 'mi' unit text."""
    cleaned = field
    for noise in (',', 'ft', 'mi'):
        cleaned = cleaned.replace(noise, '')
    return int(cleaned)
def add_ride_columns(rides) -> pd.DataFrame:
    """Return a new frame with derived columns computed from existing ones.

    Requires 'miles', 'hours' and 'feet' columns.  If there is a 'date'
    column but no 'year' column, a 'year' column (the integer after the last
    '/' of each date) is placed at position 1.  The caller's frame is never
    modified: the year column is inserted into a copy, and the remaining
    columns are added with `DataFrame.assign`.

    Derived columns:
        mph     miles per hour (2 decimals)
        vam     vertical meters climbed per hour
        fpmi    feet climbed per mile
        pct     grade in percent (2 decimals)
        kms     kilometers ridden (2 decimals)
        meters  meters climbed
    """
    if 'date' in rides and 'year' not in rides:
        # Copy first so the insert below does not mutate the argument.
        rides = rides.copy()
        years = [int(str(date).split('/')[-1]) for date in rides['date']]
        rides.insert(1, 'year', years)
    mi, hr, ft = rides['miles'], rides['hours'], rides['feet']
    return rides.assign(
        mph=(mi / hr).round(2),
        vam=(ft / hr / 3.28084).round(),  # 3.28084 feet per meter
        fpmi=(ft / mi).round(),
        pct=(ft / mi * 100 / 5280).round(2),  # 5280 feet per mile
        kms=(mi * 1.609).round(2),
        meters=(ft * 0.3048).round(),
    )
def drop_index(frame) -> pd.DataFrame:
    """Replace every index label of `frame` with '' (in place) and return it.

    The frame is modified and the same object is returned, so the call can
    be chained.
    """
    blank_labels = ['' for _ in range(len(frame))]
    frame.index = blank_labels
    return frame
# Load the per-ride and per-year tables.  The files are opened in `with`
# blocks so the handles are closed as soon as parsing is finished.
with open('bikerides.tsv') as rides_file:
    rides = parse_rides(rides_file)
with open('bikeyears.tsv') as years_file:
    yearly = parse_rides(years_file).drop(columns='date')

# `daily` is the yearly totals divided by 6 * 50, rounded to one decimal.
# NOTE(review): 6 * 50 presumably means 6 riding days a week for 50 weeks a
# year -- confirm.
daily = yearly.copy()
for name in 'hours miles feet kms meters'.split():
    daily[name] = round(daily[name].map(lambda x: x / (6 * 50)), 1)
Reading Data: segments and tiles¶
I picked some representative climbing segments (bikesegments.csv) with the segment length in miles and climb in feet, along with several of my times on the segment. A line like
Old La Honda, 2.98, 1255, 28:49, 34:03, 36:44
means that this segment of Old La Honda Rd is 2.98 miles long, 1255 feet of climbing, and I've selected three times for my rides on that segment: the fastest, middle, and slowest of the times that Strava shows. (However, I ended up dropping the slowest time in the charts to make them less busy.)
I keep track of the percentage of roads ridden in various places in 'bikeplaceshort.csv', which comes from wandrer.earth.
def parse_segments(lines) -> pd.DataFrame:
    """Parse segment lines into a DataFrame with one row per recorded time.

    Each line has the form 'title, miles, feet, time, time, ...'.  Only the
    first five comma-separated fields are used, so at most two times per
    segment are kept.  Every kept time becomes one row of
    (title, hours, miles, feet), and the derived ride columns are then added.

    Blank lines and lines starting with '#' are skipped, matching the
    `comment='#'` convention used for the other data files.
    """
    records = []
    for segment in lines:
        stripped = segment.strip()
        if not stripped or stripped.startswith('#'):
            continue  # nothing to unpack on a blank or comment line
        title, mi, ft, *times = segment.split(',')[:5]
        records.extend((title, parse_hours(time), float(mi), parse_int(ft))
                       for time in times)
    frame = pd.DataFrame(records, columns=('title', 'hours', 'miles', 'feet'))
    return add_ride_columns(frame)
def make_clickable(comment) -> str:
    """Turn a comment of the form 'text!activity_number' into an HTML link.

    The part after the last '!' is used as the Strava activity number and
    the part before it as the link text.  A comment without '!' is returned
    unchanged.  Splitting on the last '!' only means link text that itself
    contains '!' no longer raises.
    """
    if '!' not in comment:
        return comment
    anchor, number = comment.rsplit('!', 1)
    return f'<a href="https://www.strava.com/activities/{number}" rel="noopener noreferrer" target="_blank">{anchor}</a>'
def link_date(date) -> str:
    """Turn an 'm/d/y' date string into a clickable statshunters link.

    The link's `to=` parameter is the year, month and day concatenated.
    Month and day are zero-padded to two digits so that a date written as
    '10/5/2020' yields '20201005' rather than the ambiguous '2020105'.
    The link text is the date exactly as given.
    """
    m, d, y = date.split('/')
    stamp = f'{y}{m.zfill(2)}{d.zfill(2)}'
    return f'<a href="https://www.statshunters.com/?to={stamp}" rel="noopener noreferrer" target="_blank">{date}</a>'
# Load the climbing segments; the `with` block closes the file once parsed.
with open('bikesegments.csv') as segments_file:
    segments = parse_segments(segments_file)
# Tile-tracking milestones, newest first.  A comment of the form
# 'text!number' is rendered as a link to that Strava activity, and each date
# as a statshunters link.
tiles = drop_index(pd.DataFrame(
    data=[
        ('01/01/2025', 14, 1395, 3520, 'Start of 2025'),
        ('09/21/2024', 14, 1394, 3496, 'Michael J. Fox ride in Sonoma!12470434052'),
        ('04/28/2024', 14, 1275, 3382, 'Livermore!11287081291'),
        ('02/25/2024', 14, 1196, 3279, 'Expanding through Santa Cruz and to the South!10838162005'),
        ('01/01/2024', 14, 1056, 3105, 'Start of 2024'),
        ('12/08/2023', 14, 1042, 3084, 'Benicia ride connects East Bay and Napa clusters!10350071201'),
        ('11/05/2023', 14, 932, 2914, 'Alum Rock ride gets 14x14 max square!8850905872'),
        ('06/30/2023', 13, 689, 2640, 'Rides in east Bay fill in holes!9298603815'),
        ('04/14/2023', 13, 630, 2595, 'Black Sands Beach low-tide hike connects Marin to max cluster!8891171008'),
        ('03/04/2023', 13, 583, 2574, 'Almaden rides connects Gilroy to max cluster!8654437264'),
        ('10/22/2022', 13, 396, 2495, 'Alviso levees to get to 13x13 max square!8003921626'),
        ('10/16/2022', 12, 393, 2492, 'Milpitas ride connects East Bay to max cluster!7974994605'),
        ('09/08/2022', 11, 300, 2487, 'First started tracking tiles'),
    ],
    columns=['date', 'square', 'cluster', 'total', 'comment'],
)).style.format({'date': link_date, 'comment': make_clickable})
Plotting and Curve-Fitting¶
# Default figure size for every plot, as (width, height).
matplotlib.rcParams["figure.figsize"] = (12, 6)
def show(X, Y, data, title='', degrees=(2, 3)):
    """Scatter-plot column X against column Y of `data`, with fitted curves.

    One best-fit polynomial is drawn for each degree in `degrees`, sampled at
    100 evenly spaced points between the smallest and largest X value.
    """
    grid()
    plt.ylabel(Y)
    plt.xlabel(X)
    plt.title(title)
    plt.scatter(X, Y, data=data, c='grey', marker='+')
    xs, ys = data[X], data[Y]
    sample_points = np.linspace(min(xs), max(xs), 100)
    for degree in degrees:
        fitted = poly_fit(xs, ys, degree)
        plt.plot(sample_points, list(map(fitted, sample_points)), '-')
def grid(axis='both'):
    """Turn on minor ticks and draw major (solid) and minor (dotted) grid lines."""
    plt.minorticks_on()
    grid_styles = (('major', '-', 3/4), ('minor', ':', 1/2))
    for which, line_style, opacity in grid_styles:
        plt.grid(which=which, ls=line_style, alpha=opacity, axis=axis)
def poly_fit(X, Y, degree: int) -> callable:
    """Return the least-squares polynomial of the given degree fitted to X, Y.

    The result is a function of one argument that evaluates the fitted
    polynomial at that point.
    """
    # np.polyfit lists the highest power first; reverse so that the list
    # position of each coefficient equals its power.
    ascending = np.polyfit(X, Y, degree)[::-1]

    def polynomial(x):
        total = 0
        for power, coefficient in enumerate(ascending):
            total = total + coefficient * x ** power
        return total

    return polynomial
# Quadratic fit of speed (miles per hour) as a function of steepness
# (feet climbed per mile) over all recorded rides.
estimator = poly_fit(X=rides['feet'] / rides['miles'],
                     Y=rides['miles'] / rides['hours'],
                     degree=2)
def estimate(miles, feet, estimator=estimator) -> float:
    """Estimate the riding time in minutes for a given distance and climb.

    `estimator` maps feet climbed per mile to a predicted speed in miles per
    hour; the result is rounded to whole minutes.
    """
    feet_per_mile = feet / miles
    predicted_mph = estimator(feet_per_mile)
    minutes = 60 * miles / predicted_mph
    return round(minutes)
def top(frame, field, n=20):
    """Return the `n` rows of `frame` with the largest `field`, index blanked."""
    ranked = frame.sort_values(field, ascending=False)
    return drop_index(ranked.head(n))
Wandrer Places¶
def mapl(f, *values):
    """Like `map`, but returns a list instead of an iterator."""
    return [*map(f, *values)]
# Load the places table (comma-separated, '#' starts a comment); the `with`
# block closes the file once it has been read.
with open('bikeplaceshort.csv') as places_file:
    places = drop_index(pd.read_table(places_file, sep=',', comment='#'))
def wandrer(places=places, by=['pct', 'name'], ascending=[False, True], county=None):
    """All those who wander are not lost.

    Build a report of places sorted by `by` / `ascending` (also try
    by=['county', 'pct']), optionally restricted to one `county`.  Columns:
    name, county, total miles, miles done, percent done, badge earned, and
    the distance to the next badge and to the next big badge.

    Side effect: sets the pandas option 'display.max_rows' to None.
    """
    selected = places[places.county == county] if county else places
    F = drop_index(selected.sort_values(by=by, ascending=ascending))
    pd.set_option('display.max_rows', None)
    totals, pcts = F['miles'], F['pct']
    big_badges = {25: .25, 90: .50}
    report = {
        'name': F['name'],
        'county': F['county'],
        'total': totals,
        'done': [rounded(m * p / 100) for m, p in zip(totals, pcts)],
        'pct': [pretty_pct(p) for p in pcts],
        'badge': [badge(float(p)) for p in pcts],
        'to next badge': [to_go(p, m) for p, m in zip(pcts, totals)],
        'to big badge': [to_go(p, m, big_badges) for p, m in zip(pcts, totals)],
    }
    return pd.DataFrame(report)
def pretty_pct(pct) -> str:
    """Format a percentage: '100%' exactly, 2 decimals above 1, else 4 decimals."""
    if pct == 100:
        return '100%'
    if pct > 1:
        return f'{pct:.2f}%'
    return f'{pct:.4f}%'
def badge(pct) -> str:
    """Return the highest badge level reached at `pct`, or 'none'."""
    reached = (level for level in (99, 90, 75, 50, 25) if pct >= level)
    best = next(reached, None)
    return 'none' if best is None else f'{best}%'
# Target percentages, in ascending order, mapped to the bonus used by
# `to_go` (the bonus is multiplied by a place's total miles).  The order
# matters: `to_go` reports the first target not yet reached.
bonuses = {
    0.02: 0,
    0.1: 0,
    0.2: 0,
    1: 0,
    2: 0,
    25: 0.25,
    50: 0.05,
    75: 0.10,
    90: 0.50,
    99: 0.10,
}
def to_go(pct, miles, bonuses=bonuses) -> str:
    """Describe the next target percentage not yet reached, or '' if none.

    `bonuses` maps target percentages to bonus fractions and is scanned in
    its own iteration order.  The description gives the miles still needed
    and the points earned: the bonus fraction of the total miles plus the
    miles still needed.
    """
    done = pct * miles / 100
    for target_pct, bonus in bonuses.items():
        target_miles = target_pct / 100 * miles
        if done < target_miles:
            delta = target_miles - done
            points = rounded(bonus * miles + delta)
            return f'{rounded(delta):>5} mi to {target_pct}% ({points} points)'
    return ''
def rounded(x: float) -> str:
    """Format x compactly, using fewer digits as the magnitude grows.

    Above a million: the value in millions, formatted recursively, plus 'M'.
    Above 100,000:   millions with two decimals plus 'M'.
    Above 10:        a whole number with thousands separators.
    Otherwise:       one decimal place.
    """
    if x > 1e6:
        return rounded(x / 1e6) + 'M'
    if x > 1e5:
        return f'{x/1e6:4.2f}M'
    if x > 10:
        return f'{round(x):,d}'
    return f'{x:.1f}'
# Every place whose county code is not one of the ones listed here.
other_places = places[
    ~places['county'].isin(['---', 'SMC', 'SCC', 'SFC', 'ALA'])
]
SMC / SCC Leaders¶
def make_leaders(raw_data):
    """Build the leaders table from (name, SMC percent, SCC percent) tuples.

    Each row is extended with the miles ridden in each county and in total,
    the geometric and arithmetic means of the two percentages (2 decimals),
    and the rider's initials.
    """
    columns = ['Name', 'SMC %', 'SCC %', 'SMC miles', 'SCC miles', 'Total miles',
               'GeoMean %', 'Mean %', 'Initials']
    rows = []
    for name, SMp, SCp in raw_data:
        miles = county_miles(SMp, SCp)
        geo_mean = round((SMp * SCp) ** 0.5, 2)
        mean = round((SMp + SCp) / 2, 2)
        rows.append((name, SMp, SCp, *miles, geo_mean, mean, initials(name)))
    return drop_index(pd.DataFrame(rows, columns=columns))
def county_miles(SMp, SCp) -> list:
    """Convert two county percentages into [SMC miles, SCC miles, total miles].

    Each percentage is applied to a fixed mileage for its county and rounded
    to a whole number; the total is the sum of the two rounded values.
    """
    # NOTE(review): 2827.3 and 7688.7 are presumably the total road miles of
    # the two counties -- confirm against the data source.
    county_totals = (2827.3, 7688.7)
    ridden = [round(total * pct / 100)
              for total, pct in zip(county_totals, (SMp, SCp))]
    return ridden + [sum(ridden)]
def initials(name: str) -> str:
    """Return the first letters of the first and last words of `name`.

    Surrounding whitespace is ignored, so '  Peter Norvig' gives 'PN'.  A
    single-word name uses the same letter twice ('Cher' gives 'CC').  An
    empty or all-whitespace name gives ''.
    """
    words = name.split()
    if not words:
        return ''
    return words[0][0] + words[-1][0]
def plot_leaders(leaders, by='Mean %'):
    """Scatter-plot the leaders' 'SMC %' against 'SCC %', labelled by initials.

    The frame is sorted by `by` in descending order; the sorted frame is
    returned.  Each point is labelled with the initials of the name in the
    row's first column, drawn slightly to the right of and below the marker.
    The second and third columns are assumed to hold the plotted x and y
    values.
    """
    leaders = leaders.sort_values(by=by, ascending=False)
    ax = leaders.plot('SMC %', 'SCC %', kind='scatter', marker='D')
    grid()
    ax.set_xlabel('San Mateo County %')
    ax.set_ylabel('Santa Clara County %')
    for _index, (name, x, y, *_rest) in leaders.iterrows():
        ax.text(x + 0.7, y - 0.2, initials(name))
    return leaders
leaders = make_leaders([ # Data as of Mar 24, 2025: (Name, SMC %, SCC %)
('Megan Gardner', 99.94, 25.21),
('Matthew Ring', 83.16, 2.48),
('Peter Norvig', 78.80, 38.83),
('Barry Mann', 78.27, 31.09),
('Catherine Kircos', 54.47, 16.04),
('Elliot Hoff', 52.89, 6.13),
('Greogory P. Smith', 51.37, 23.27),
('Brian Feinberg', 36.76, 48.22),
('Chris Okeefe', 32.17, 48.30),
('Jason Molenda', 7.60, 56.15),
('Jim Brooks', 6.17, 53.51),
])
Eddington Number¶
def Ed_number(rides, units) -> int:
    """Eddington number: The maximum integer e such that you have bicycled
    a distance of at least e on at least e days.

    `rides[units]` supplies the distances.  Returns 0 when there are no
    rides, or when no ride reaches a distance of 1, instead of raising.
    """
    distances = sorted(rides[units], reverse=True)
    # With distances in descending order, the e-th longest ride being at
    # least e long means that at least e rides are that long.
    return max((e for e, d in enumerate(distances, 1) if d >= e), default=0)
def Ed_gap(distances, target) -> int:
    """How many more rides of at least `target` are needed to reach that
    Eddington number.

    `distances` must support an element-wise `>=` comparison, as a pandas
    Series does.
    """
    long_enough = count(distances >= target)
    return target - long_enough
def Ed_gaps(rides, N=9) -> pd.DataFrame:
    """A table of gaps to the next N Eddington numbers, in kms and miles.

    Row d (0-based) covers the targets d above the next Eddington number in
    each unit.  Columns are 'kms', 'kms gap', 'miles' and 'miles gap', where
    each gap is the number of additional rides of at least the target
    distance that are still needed.
    """
    next_km = Ed_number(rides, 'kms') + 1
    next_mi = Ed_number(rides, 'miles') + 1
    rows = []
    for d in range(N):
        km_target, mi_target = next_km + d, next_mi + d
        rows.append((km_target, Ed_gap(rides.kms, km_target),
                     mi_target, Ed_gap(rides.miles, mi_target)))
    table = pd.DataFrame(rows, columns=['kms', 'kms gap', 'miles', 'miles gap'])
    return drop_index(table)
def Ed_progress(rides, years=range(2024, 2013, -1)) -> pd.DataFrame:
    """A table of Eddington numbers by year.

    For each year, the numbers are computed over all rides whose 'year' is
    that year or earlier.  Columns are 'year', 'Ed_km' and 'Ed_mi'.
    """
    rows = []
    for year in years:
        rides_so_far = rides[rides['year'] <= year]
        rows.append((year,
                     Ed_number(rides_so_far, 'kms'),
                     Ed_number(rides_so_far, 'miles')))
    table = pd.DataFrame(rows, columns=['year', 'Ed_km', 'Ed_mi'])
    return drop_index(table)
def count_rides(rides, unit='kms', distance=100) -> int:
    """Count the rides strictly longer than `distance`, measured in `unit`."""
    is_longer = rides[unit] > distance
    return count(is_longer)
# Summing a sequence of booleans counts how many of them are true.
count = sum