UK House Prices in Python - (a) Data Gathering and Visualisation¶
David Miller - September 2020 - Link to Github¶
It's been a while since my last post, but I've been busy. I've taken a year out to complete a Masters in Machine Learning at UCL, an experience I might talk about at a later time. For now let's just say it's been an intense 12 months. However, now that September's rolled around and I'm all done and dusted, I thought it was time to turn back to unfinished business.
Early last year I did some work visualising UK House Prices using F#, MathNet.Numerics, and Plotly. It was mostly an exercise in demonstrating the use of these packages, but I enjoyed digging around in the data. It felt like a rich seam. So, 18 months later, I thought I'd have a 'proper' poke at managing geographic data using Python and Geopandas, and potentially modelling it using some simple machine learning techniques.
In this post I step through the process of gathering the geographic data, all open-sourced, and performing simple visualisations to better understand the distributions.
Setup¶
As always, there's work to do before we begin, but here I've tried to keep it simple. I'm using Python v3.7, and have installed pandas, geopandas, and matplotlib using conda. The cell below loads these libraries, or relevant functions from them (note that shapely is installed as a dependency of geopandas).
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, LineString
import glob
from os import path, getcwd
from utils import ProgressBar
from matplotlib import pyplot as plt
from matplotlib.dates import DateFormatter, YearLocator
%matplotlib inline
directory='data'
UK House Price Index¶
First we'll take a look at the House Price Index in the UK over the last 25 years.
House prices have been subject to significant growth and inflation since the late 1980s, and this trend has shown little sign of abating, barring a small blip after the 2008 recession and a dampening caused by uncertainty since the 2016 Brexit vote and, more recently, Covid-19.
I've downloaded the house price dataset direct from the UK Government's open data platform, which gives us a csv file of monthly data by geography from January 1995 to June 2020:
You can do this manually, and I always recommend checking out the provider of your data before you use it.
Data¶
In the cell below, I load the csv into a pandas dataframe, format the 'Date' column from string to datetime, and filter by region. The original file gives us information from all over the UK, so I've chosen 7 representative cities to compare:
# load from csv
house_price_index = pd.read_csv(path.join(directory,'CPI/UK-HPI-full-file-2020-06.csv'))
house_price_index = house_price_index.assign(Date = pd.to_datetime(house_price_index['Date'],
                                                                   format='%d/%m/%Y'))
# filter by region
house_price_index = house_price_index[house_price_index['RegionName'].isin(['Cambridge','Oxford','Inner London','Manchester',
                                                                            'Brighton and Hove','City of Edinburgh','Cardiff'])]
As always with pandas, this is remarkably easy.
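If you want a quick sanity check at this point, something like the following sketch confirms the date parsing and the regions we kept:
# confirm the parsed dtype and the regions retained (a quick sanity-check sketch)
print(house_price_index['Date'].dtype)            # expect datetime64[ns]
print(sorted(house_price_index['RegionName'].unique()))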
Maybe boomers had it easy...¶
Next, we want to take an immediate look at the dataset. Using matplotlib, I plot the time series of average house prices in each city. This is provided as a field, among others, in the government dataset that we downloaded, and requires no calculation:
plt.figure(figsize=(12,5))
for region,group in house_price_index.groupby(['RegionName']):
    plt.plot(group['Date'],
             group['AveragePrice'],
             label=region)
plt.legend()
plt.title('HPI - Average Price of Properties across the UK')
plt.ylabel('Average Price (£)')
plt.xlabel('Transaction Date')
plt.xticks(rotation=-90)
plt.gca().xaxis.set_major_formatter(DateFormatter('%Y'))
plt.gca().xaxis.set_major_locator(YearLocator())
plt.show()
plt.figure(figsize=(12,5))
for region,group in house_price_index.groupby(['RegionName']):
    plt.plot(group['Date'],
             (1+group['AveragePrice'].pct_change()).cumprod(),
             label=region)
plt.legend()
plt.title('HPI - Relative Cumulative Change of Average Price of Properties across the UK')
plt.ylabel('growth_since_95')
plt.xlabel('Transaction Date')
plt.xticks(rotation=-90)
plt.gca().xaxis.set_major_formatter(DateFormatter('%Y'))
plt.gca().xaxis.set_major_locator(YearLocator())
plt.show()
We can draw a few conclusions from these graphs:
- Unsurprisingly, London takes the top spot. The average house price in London has increased to nearly $8$ times its 1995 price.
- Perhaps more surprising, Brighton has seen growth as explosive as Inner London's. I grew up near Brighton and have seen the shift from a dirty, artsy (probably undervalued) student city in the 1990s to an expensive, hipster, London-commutable middle-class hideaway now.
- Other cities on our list, such as Oxford and Cambridge, are equally London-commutable, and while they haven't seen the same level of growth, prices are now $5$ or $6$ times what they were 25 years ago.
Insights¶
These numbers seem drastic (and disheartening to anyone trying to get on the property ladder), but there are a couple of mitigating factors:
Consumer prices excluding housing have also risen since 1995, to over $3$ times their original prices. This is known as consumer price inflation and, put simply, reflects how the value of a pound changes over time. If CPI increases $3$ times and house prices increase $8$ times in a given period, the 'real' average house price has only increased $\frac{8}{3}\approx 2.7$ times. Note that we intuitively expect wages to change in line with CPI, although there may be short and medium term deviations in this trend. Therefore, the 'average' London house costs nearly $3$ times as many paychecks now as it did in 1995.
House prices are likely to have a right-skewed and clipped distribution; it's effectively impossible to have a negative house price, and therefore the average of house prices is likely to be skewed upwards compared to a more robust measure such as the median. This shouldn't have a systemic impact on the growth metric unless the relative number of high-end property transactions has changed compared to the number of transactions involving middle-range or tighter-budget properties. Unfortunately I don't have that data available, but it's always worth bearing in mind when using a bald average in your data.
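To illustrate that mean/median gap, here's a quick sketch using synthetic log-normally distributed 'prices' (my own illustration, not the HPI data):
import numpy as np

# synthetic right-skewed 'prices': log-normal, roughly house-price-sized
rng = np.random.default_rng(0)
prices = np.exp(rng.normal(loc=12.5, scale=0.5, size=10_000))
print(f"mean:   £{prices.mean():,.0f}")      # pulled up by the long right tail
print(f"median: £{np.median(prices):,.0f}")  # a more robust central measure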
Property Types¶
Next we want to look at how the prices of different types of properties have changed over time. This information is also included in the government HPI dataset. We'd expect the growth of different property types within the same area to be closely bound, as they all follow the trends of the local area. Any differences in growth must be caused by systemic shifts between types, which are intuitively unlikely; for example, a sudden dearth or glut of terraced houses compared to semi-detached houses.
In the cells below we filter the HPI data to the Cambridge region only, and plot the time series of AveragePrice for Detached, Semi-Detached, Terraced, and Flat property types:
house_price_index = house_price_index[house_price_index['RegionName']=='Cambridge']
plt.figure(figsize=(12,5))
for house_type in ['Detached','SemiDetached','Terraced','Flat']:
    plt.plot(house_price_index['Date'],
             house_price_index[house_type+'Price'],
             label=house_type)
plt.legend()
plt.title('HPI - Average Price of Property Types in Cambridge')
plt.ylabel('Average Price (£)')
plt.xlabel('Transaction Date')
plt.xticks(rotation=-90)
plt.gca().xaxis.set_major_formatter(DateFormatter('%Y'))
plt.gca().xaxis.set_major_locator(YearLocator())
plt.show()
plt.figure(figsize=(12,5))
for house_type in ['Detached','SemiDetached','Terraced','Flat']:
    plt.plot(house_price_index['Date'],
             (1+house_price_index[house_type+'Price'].pct_change()).cumprod(),
             label=house_type)
plt.legend()
plt.title('HPI - Relative Cumulative Change of Average Price of Property Types in Cambridge')
plt.ylabel('growth_since_95')
plt.xlabel('Transaction Date')
plt.xticks(rotation=-90)
plt.gca().xaxis.set_major_formatter(DateFormatter('%Y'))
plt.gca().xaxis.set_major_locator(YearLocator())
plt.show()
The graphs show that:
- Unsurprisingly, there is a distinct and constant order of pricing for property types: Detached, Semi-Detached, Terraced, Flat.
- Semi-detached properties are only slightly more valued than Terraced properties, possibly because the 'Terraced' type includes end-of-terrace properties, which are effectively semi-detached.
- Detached property growth has lagged the others since 2003; similarly, growth in Flat prices has lagged the other types since 2008.
Insights¶
Data points in the long tail of the property price distribution (let's say properties over £1m for example) are more likely to be Detached properties, and therefore the average price of detached properties is more likely than the other types to be inflated by outliers.
Cumulative growth series can easily be knocked by a few individual months of poor growth that are never corrected. These deviations are compounded over time, implying a long period of underperformance or outperformance that is illusory. As an example of this, the month-by-month growth rates are shown in the cell below.
However, since consumers can in theory move between property types, we should expect short-term changes to correct themselves. There should be a form of arbitrage, where buyers of cheaper property types stretch up to buy in the next class up. In practice, the absolute difference in price is a barrier, such that the current relative undervaluation of detached properties compared to semi-detached or terraced properties has persisted for 10 years.
plt.figure(figsize=(12,5))
for house_type in ['Detached','SemiDetached','Terraced','Flat']:
    plt.plot(house_price_index['Date'],
             (1+house_price_index[house_type+'Price'].pct_change()),
             label=house_type)
plt.legend()
plt.title('HPI - Month-on-Month Relative Change of Average Price of Property Types in Cambridge')
plt.ylabel('monthly_growth')
plt.xlabel('Transaction Date')
plt.xticks(rotation=-90)
plt.gca().xaxis.set_major_formatter(DateFormatter('%Y'))
plt.gca().xaxis.set_major_locator(YearLocator())
plt.show()
Individual Transaction Data¶
Data¶
Next we examine data for individual transactions made available by the UK Land Registry. This transaction data contains information about sales of property in the UK, including Date, Price, and information about the property itself:
This dataset comes with a postcode field, which allows us to look up the geographical position of each transaction using Postcode-Coordinates translation data. This data is available from the Office for National Statistics:
Postcodes¶
In the UK we use postcodes to represent collections of addresses; for example streets and blocks of flats. I downloaded the dataset described above to the 'data/Postcodes' directory. It consists of a list of csv files, each containing Postcode and Northing-Easting coordinates for a given region.
In the cell below, these csv files are read and their contents merged into a single pandas dataframe.
# load csv files using pandas
postcode_coords = pd.concat((pd.read_csv(f, usecols=[0,2,3], header=0, names=['postcode','x','y'])
                             for f in glob.glob(path.join(directory,'Postcodes/*.csv'))))
# remove white space from postcode column
postcode_coords = postcode_coords.assign(postcode = postcode_coords['postcode'].str.replace(' ', ''))
Next, we want to import this dataset into a proper geospatial data handling library: geopandas. According to the docs:
GeoPandas extends the datatypes used by pandas to allow spatial operations on geometric types. Geometric operations are performed by shapely. Geopandas further depends on fiona for file access and descartes and matplotlib for plotting.
geopandas provides a GeoDataFrame class that helpfully provides many geospatial data handling functions. It is based on the ubiquitous pandas DataFrame, and manages coordinates in a special GeoSeries class containing geometries from the shapely library.
The main superficial differences between a GeoDataFrame and its parent DataFrame are these geometries and the .cx property, which allows a GeoDataFrame to be indexed by coordinates rather than by row.
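As a toy illustration of .cx (a sketch of my own, with arbitrary coordinates rather than our postcode data):
import geopandas as gpd
from shapely.geometry import Point

# three points; .cx slices by bounding box [xmin:xmax, ymin:ymax]
toy = gpd.GeoDataFrame({'name': ['a', 'b', 'c']},
                       geometry=[Point(0, 0), Point(5, 5), Point(10, 10)])
print(toy.cx[4:11, 4:11])  # keeps 'b' and 'c' only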
Finally, combining geospatial data can be complicated, especially if two pieces of data use different coordinate systems (Latitude and Longitude vs OS Grid Reference vs Northing and Easting coordinates) or different map projections (Mercator vs Equi-Rectangular vs Gall-Peters projection). In order to account for these differences, Coordinate Reference Systems are encoded in GeoDataFrame instances, with projections and translations managed by the geopandas library.
Our data is pulled direct from csv, but all instances use Northing and Easting coordinates from the 'OSGB 1936 / British National Grid' system. In the cell below we use an ESRI WKT (Well-Known Text) string representation of the system, extracted from the epsg.io website, to encode it in the GeoDataFrame:
# define crs
crs = 'PROJCS["OSGB_1936_British_National_Grid",GEOGCS["GCS_OSGB 1936",DATUM["D_OSGB_1936",SPHEROID["Airy_1830",6377563.396,299.3249646]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",49],PARAMETER["central_meridian",-2],PARAMETER["scale_factor",0.9996012717],PARAMETER["false_easting",400000],PARAMETER["false_northing",-100000],UNIT["Meter",1]]'
# convert to geodataframe
postcode_coords = gpd.GeoDataFrame(postcode_coords,
                                   geometry = gpd.points_from_xy(postcode_coords['x'],
                                                                 postcode_coords['y']),
                                   crs=crs)
# clip to correct geography
postcode_coords = postcode_coords.cx[520000:575000,235000:280000]
Note that we have clipped the postcodes to a predetermined set of coordinates to focus on the Cambridge area, rather than the whole of the UK, as in my previous post.
House Prices¶
Next we load the historical transaction data in chunks (so as to manage the large file size), and merge this information with the postcode coordinate data. By using an inner join, this automatically filters out transactions with unknown postcodes (a very small proportion) and transactions from outside of the Cambridge area.
The cell below performs this load/merge operation, and converts the resulting data into a GeoDataFrame like the postcode data (this may take a couple of minutes to run):
# use chunking to manage large file, remove whitespace from Postcode column and merge with postcode_coords
house_prices = pd.concat((pd.merge(chunk.assign(Postcode = chunk['Postcode'].str.replace(' ', '')),
postcode_coords,
how='inner',left_on='Postcode', right_on='postcode')
for chunk in pd.read_csv(path.join(directory,
'HousePrices/LandRegistry_HousePrices_Full.csv'),
chunksize=1e6)))
# convert to geodataframe
house_prices = gpd.GeoDataFrame(house_prices,
geometry = gpd.points_from_xy(house_prices['x'],
house_prices['y']),
crs=crs)
# save to new csv
house_prices.to_csv(path.join(directory,'HousePrices/house_prices_full_local.csv'))
The resulting dataset contains more than 260,000 entries reaching back to 1995, each with the date of the transaction, the property type, address, geospatial coordinates, and the price paid for it, among a number of other fields.
The cells below show the column counts and the first 5 entries in the dataset:
house_prices.count()
Transaction unique identifier        261703
Price                                261703
Date of Transfer                     261703
Postcode                             261703
Property Type                        261703
Old/New                              261703
Duration                             261703
PAON                                 261697
SAON                                  14752
Street                               260391
Locality                             204985
Town/City                            261703
District                             261703
County                               261703
PPD Category Type                    261703
Record Status - monthly file only    261703
postcode                             261703
x                                    261703
y                                    261703
geometry                             261703
dtype: int64
house_prices.head()
Transaction unique identifier | Price | Date of Transfer | Postcode | Property Type | Old/New | Duration | PAON | SAON | Street | Locality | Town/City | District | County | PPD Category Type | Record Status - monthly file only | postcode | x | y | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | {E76CCDAE-3A80-434F-99C3-91F5106F74E3} | 35200 | 1995-06-09 00:00 | SG75LW | F | N | L | THE MALTINGS | FLAT 11 | GREEN LANE | ASHWELL | BALDOCK | NORTH HERTFORDSHIRE | HERTFORDSHIRE | A | A | SG75LW | 527291 | 239988 | POINT (527291.000 239988.000) |
1 | {68405155-0EF8-47F5-A7E7-87D70D4AFE4D} | 46000 | 1995-06-16 00:00 | SG75LW | F | N | L | THE MALTINGS | FLAT 18 | GREEN LANE | ASHWELL | BALDOCK | NORTH HERTFORDSHIRE | HERTFORDSHIRE | A | A | SG75LW | 527291 | 239988 | POINT (527291.000 239988.000) |
2 | {CAF8C257-E364-466D-89DB-8BF83F78854B} | 38650 | 1995-09-29 00:00 | SG75LW | F | N | L | THE MALTINGS | FLAT 6 | GREEN LANE | ASHWELL | BALDOCK | NORTH HERTFORDSHIRE | HERTFORDSHIRE | A | A | SG75LW | 527291 | 239988 | POINT (527291.000 239988.000) |
3 | {64352770-A858-4DAC-9B5E-0F3973D6E113} | 42000 | 1995-02-20 00:00 | SG75LW | F | N | L | THE MALTINGS | FLAT 9 | GREEN LANE | ASHWELL | BALDOCK | NORTH HERTFORDSHIRE | HERTFORDSHIRE | A | A | SG75LW | 527291 | 239988 | POINT (527291.000 239988.000) |
4 | {D53B4676-DAAC-4008-B504-287263FB16CC} | 43975 | 1995-10-06 00:00 | SG75LW | F | N | L | THE MALTINGS | FLAT 19 | GREEN LANE | ASHWELL | BALDOCK | NORTH HERTFORDSHIRE | HERTFORDSHIRE | A | A | SG75LW | 527291 | 239988 | POINT (527291.000 239988.000) |
Visualising Transactions¶
Having collated our house transaction data, our next step is to explore it visually.
As shown in the HPI exploration, house prices in Cambridge have been relatively level since the start of 2016. Therefore we filter our dataset to the last 4 years to make the prices more comparable:
# filter by date
house_prices_latest=house_prices[(house_prices['Date of Transfer']>='2016-01-01')]
Price Distributions¶
The cell below plots histograms of prices across different orders of magnitude:
fig,ax=plt.subplots(1,3, figsize=(15,4))
ax[0].hist(house_prices_latest[house_prices_latest['Price']<1e6]['Price'], bins=50,
           alpha=0.7,ec='white')
ax[0].set_title('Prices < £1m')
ax[0].set_xlabel('Price')
ax[1].hist(house_prices_latest[(house_prices_latest['Price']>=1e6) & (house_prices_latest['Price']<1e7)]['Price'], bins=50,
           alpha=0.7,ec='white')
ax[1].set_title('£1m <= Prices < £10m')
ax[1].set_xlabel('Price')
ax[2].hist(house_prices_latest[house_prices_latest['Price']>=1e7]['Price'], bins=50,
           alpha=0.7,ec='white')
ax[2].set_title('£10m <= Prices')
ax[2].set_xlabel('Price')
plt.show()
The distribution of prices is clearly skewed, as expected. A very small number of properties were apparently sold for >£10m (right hand graph). These were either erroneous entries or extreme outliers, and given that they are far outside the range of the average house-buyer's budget (at least mine), they can safely be ignored in future analysis.
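A one-line filter along these lines (sketched here, not applied below) would drop them:
# drop the handful of apparently erroneous >£10m transactions (sketch only)
house_prices_trimmed = house_prices_latest[house_prices_latest['Price'] < 1e7]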
We see that the vast majority of properties sold were priced between £100k and £1m, and in the cell below I show the histograms of both prices and the logs of prices within that range:
import numpy as np
fig,ax =plt.subplots(1,2, figsize=(12,4))
ax[0].hist(house_prices_latest[(house_prices_latest['Price']>=1e5) & (house_prices_latest['Price']<1e6)]['Price'],
           bins=50,alpha=0.7,ec='white')
ax[0].set_title('Distribution of Price')
ax[0].set_xlabel('Price')
ax[1].hist(np.log(house_prices_latest[(house_prices_latest['Price']>=1e5) & (house_prices_latest['Price']<1e6)]['Price']),
           bins=50,alpha=0.7,ec='white')
ax[1].set_title('Distribution of log(Price)')
ax[1].set_xlabel('log(Price)')
plt.show()
As a rough and ready fix, clipping data can help when analysing non-symmetrically distributed data by making it look sort-of symmetric. However, even in the clipped prices (left) the skew is apparent. On the other hand, the log of Price distribution is almost perfectly symmetric, and close to normally distributed.
Symmetrically distributed target data is important when performing regressions, for example, as skew in the target data will equally skew the regression, resulting in coefficients that systematically over-predict prices for most examples in the dataset.
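Here's a quick numerical sketch of that asymmetry, assuming scipy is available (it isn't used elsewhere in this post):
from scipy.stats import skew

in_range = house_prices_latest[(house_prices_latest['Price'] >= 1e5) &
                               (house_prices_latest['Price'] < 1e6)]['Price']
print(f"skew of Price:      {skew(in_range):.2f}")          # strongly positive
print(f"skew of log(Price): {skew(np.log(in_range)):.2f}")  # close to zero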
Finally, the cell below shows that this holds true even when the properties are broken up by type.
house_types = {'D':'Detached','S':'SemiDetached','T':'Terraced','F':'Flat'}
fig, ax = plt.subplots(2,2,figsize=(12,8))
for k,house_type in enumerate(house_types.keys()):
    j = k % 2
    i = int((k-j)/2)
    ax[i,j].hist(np.log(house_prices_latest[(house_prices_latest['Property Type']==house_type) &
                                            (house_prices_latest['Price']>=1e5) &
                                            (house_prices_latest['Price']<1e6)]['Price']),
                 bins=50,alpha=0.7,ec='white')
    ax[i,j].set_title(house_types[house_type]+' Property Transactions')
Geospatial Distributions¶
Having looked at the distributions of property prices, we can now turn our attention to the geospatial qualities of the data.
As mentioned before, geopandas uses matplotlib as the basis of its plotting functions. This means we can use the GeoDataFrame.plot() method as it interacts with our matplotlib.figure objects. In the cell below, house prices are plotted on their x and y coordinates using geopandas built-in functions, and then tailored as part of a matplotlib figure:
house_types = {'D':'Detached','S':'SemiDetached','T':'Terraced','F':'Flat'}
fig, ax = plt.subplots(2,2,figsize=(15,10))
for k,house_type in enumerate(house_types.keys()):
    j = k % 2
    i = int((k-j)/2)
    subset = house_prices_latest[house_prices_latest['Property Type']==house_type]
    subset.plot(ax=ax[i,j], column='Price',
                s=2, vmin=100000, vmax=800000)
    ax[i,j].set_title(house_types[house_type]+' Property Transactions')
    ax[i,j].set_xlim((520000,575000))
    ax[i,j].set_ylim((235000,280000))
    if i==0:
        ax[i,j].get_xaxis().set_visible(False)
    if j==1:
        ax[i,j].get_yaxis().set_visible(False)
fig.colorbar(ax[1,1].collections[0], ax=ax[:,:].ravel(), shrink=1.0)
Insights¶
Now this is much more interesting... Immediately we see that property transactions of all types are much more densely situated in the centre of the plot. No surprises: this is roughly the location of the city of Cambridge. Beyond that, the data falls into small clusters, the surrounding towns and villages, with generally lower prices than the main block.
Detached properties, particularly high-value detached properties, are spread sparsely around the Cambridge area. But this is what we might expect; very high-value properties that come with a lot of land are more likely to be spread away from the city.
Even taking into account that there were fewer flats sold than any other type, there are significantly fewer flats in the satellite towns and villages compared to Cambridge proper, relative to the other property types. This is not surprising given that flats are a space-saving device for city living.
All property types show a region of higher prices in the south-west quadrant of the city, roughly where the University is placed.
Geospatial Open Data¶
Finally, I wanted to add some additional detail from other geospatial datasets; for example the positions of roads, rivers, railway lines, stations and bus stops. This data is provided in the form of shapefiles (.shp), and contains information describing lines and polygons rather than the simple points that we've been dealing with so far. The geopandas library can load and manipulate this data with ease, so we use that to plot the data on top of the house pricing data.
Planes, Trains, and Automobiles¶
Ordnance Survey:
- https://osdatahub.os.uk/downloads/open/OpenRoads
- https://osdatahub.os.uk/downloads/open/OpenRivers
- https://osdatahub.os.uk/downloads/open/OpenGreenspace
- https://osdatahub.os.uk/downloads/open/MiniScale
- https://osdatahub.os.uk/downloads/open/Strategi
NaPTAN (National Public Transport Access Nodes):
Noise Pollution:
Preprocessing¶
Data preparation such as merging and filtering files is applied using the code in preprocessing.py, but is not run here.
Further to this, we add some custom polygons for Cambridge Airport. It was very hard to find good shapefile data for airport polygons, i.e. the outlines of airports, so I built my own from a csv of manually measured coordinates. I also added points to represent the line of the airport's runway, reasoning that most planes flying low over the city would be following this path.
The csvs are included in the github repository for this blog, in the ./data/Custom directory.
airport_coords = pd.read_csv(path.join(directory,'Custom/CambridgeAirport.csv'))
runway_coords = pd.read_csv(path.join(directory,'Custom/CambridgeRunway.csv'))
airport = gpd.GeoDataFrame(index=[0],
                           geometry=[Polygon(zip(airport_coords['X (Easting)'], airport_coords['Y (Northing)']))],
                           crs=crs)
runway = gpd.GeoDataFrame(index=[0],
                          geometry=[LineString(zip(runway_coords['X (Easting)'], runway_coords['Y (Northing)']))],
                          crs=crs)
railways=gpd.read_file(path.join(directory,'railway_noise.shp'))
noisy_roads=gpd.read_file(path.join(directory,'road_noise.shp'))
end_noise=gpd.read_file(path.join(directory,'end_noise.shp'))
bus_stops = pd.read_csv(path.join(directory,'NaPTANcsv/Stops.csv'), encoding='Latin')
bus_stops = bus_stops[['NaptanCode','CommonName','Easting', 'Northing','Indicator']]
bus_stops = gpd.GeoDataFrame(bus_stops,
                             geometry=gpd.points_from_xy(bus_stops['Easting'],bus_stops['Northing']))
bus_stops = bus_stops.cx[520000:575000,235000:280000]
train_stations = pd.read_csv(path.join(directory,'NaPTANcsv/RailReferences.csv'), encoding='Latin')
train_stations = train_stations[['AtcoCode','StationName','CrsCode','Easting', 'Northing']]
train_stations = gpd.GeoDataFrame(train_stations,
                                  geometry=gpd.points_from_xy(train_stations['Easting'],train_stations['Northing']))
train_stations = train_stations.cx[520000:575000,235000:280000]
roads=gpd.read_file(path.join(directory,'os_roads_local.shp'))
greenspaces=gpd.read_file(path.join(directory,'os_greenspaces_local.shp'))
rivers=gpd.read_file(path.join(directory,'os_rivers_local.shp'))
Plotting Geopandas Data¶
Now that we've collated all of this data into GeoDataFrame objects, it is relatively easy to plot them together using geopandas' internal functions.
In the cells below I plot these datasets on top of each other:
# xmin,xmax=(520000,575000)
# ymin,ymax=(235000,280000)
xmin,xmax=(535000,560000)
ymin,ymax=(250000,270000)
fig,ax=plt.subplots(4,1,figsize=(12*float(xmax-xmin)/float(ymax-ymin),32))
for i,axis in enumerate(ax.ravel()):
    axis.set_xlim((xmin,xmax))
    axis.set_ylim((ymin,ymax))
    axis.set_xlabel('Easting')
    axis.set_ylabel('Northing')
    roads.plot(ax=ax[i],
               column='formOfWay',
               legend=(i==0),
               cmap=plt.cm.cividis, linewidth=0.5)
    if i>=1:
        rivers.plot(ax=ax[i],
                    column='form',
                    legend=(i==1),
                    cmap=plt.cm.winter, alpha=0.5)
    if i>=2:
        greenspaces.plot(ax=ax[i],
                         column='function',
                         legend=(i==2),
                         cmap=plt.cm.summer, linewidth=0.7)
    if i>=3:
        house_prices_latest.plot(ax=ax[i],
                                 column='Price',
                                 s=2, vmin=100000, vmax=700000,
                                 legend=(i==3),
                                 cmap=plt.cm.inferno, alpha=0.7)
ax[0].set_title('geospatial data: Roads and Motorways')
ax[1].set_title('geospatial data: Rivers and Waterways')
ax[2].set_title('geospatial data: Parks and Greenspaces')
ax[3].set_title('geospatial data: House Prices')
plt.show()
xmin,xmax=(535000,560000)
ymin,ymax=(250000,270000)
plt.figure(figsize=(7.96*float(xmax-xmin)/float(ymax-ymin),10))
plt.xlim((xmin,xmax))
plt.ylim((ymin,ymax))
roads.plot(ax=plt.gca(),
           column='formOfWay',
           cmap=plt.cm.cividis, linewidth=0.5)
railways.plot(ax=plt.gca(),
              column='noiseclass',
              legend=True,
              cmap=plt.cm.plasma, linewidth=0.5,
              alpha=0.7)
plt.scatter(x=bus_stops['Easting'],
            y=bus_stops['Northing'],
            c='orange',
            label='bus stop',
            s=3,
            alpha=0.7)
plt.scatter(x=train_stations['Easting'],
            y=train_stations['Northing'],
            c='indigo',
            label='train station',
            alpha=0.7)
airport.plot(ax=plt.gca(),
             label='airport',
             color='firebrick',
             alpha=0.7)
runway.plot(ax=plt.gca(),
            label='runway',
            color='firebrick',
            alpha=0.7)
plt.legend(title='key', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
plt.title('\nCombined Map of Railways, Bus Stations, and Roads\n')
plt.show()
Conclusions¶
Using this data we can calculate distances from roads, airports, green spaces, and rivers, as well as performing clustering in the Easting-Northing coordinate system to group houses by neighbourhood. This gives us a rich feature set, perfect for building an intuitive machine learning model to regress house prices.
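As a taste of that feature engineering, here's a sketch of one candidate feature, the straight-line distance from each transaction to the nearest river (the name dist_to_river is my own, and this can be slow on large datasets):
# a minimal sketch of one distance feature: metres from each transaction to the
# nearest river (metres because both layers share the metre-based BNG CRS)
rivers_union = rivers.unary_union                          # all rivers as one geometry
dist_to_river = house_prices_latest.geometry.distance(rivers_union)
print(dist_to_river.describe())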
I'm aiming to do this (time depending) in a couple of future posts, but for now this post demonstrates well the power of the geopandas library.
Thanks for reading, and as always, feel free to comment below!