First, read the input tables as pandas data frames and filter out any unnecessary columns and rows.
# Import required modules.
import random

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
# Show every column when previewing data frames in the notebook.
pd.set_option('display.max_columns', None)

# Input/output local file paths - update these to match your system.
dissemination_area_lyr = r'/home/jovyan/work/data/census/lda_000b16a_e/lda_000b16a_e.shp'
census_csv = r'/home/jovyan/work/data/census/98-400-X2016055_ENG_CSV/98-400-X2016055_English_CSV_data.csv'
out_csv = r'/home/jovyan/work/blog/language-dot-map/language-dots-100.csv'

# Load the dissemination-area polygons, keeping only the area ID and its geometry.
da_df = gpd.read_file(dissemination_area_lyr)[['DAUID', 'geometry']]
da_df.head()
# Read the language census data in chunks, keeping only the totals for
# dissemination areas (GEO_LEVEL 4). May take a while to complete.
census_cols = ['GEO_CODE (POR)', 'Dim: Knowledge of official languages (5): Member ID: [2]: English only', 'Dim: Knowledge of official languages (5): Member ID: [3]: French only', 'Dim: Knowledge of official languages (5): Member ID: [4]: English and French']
filtered_chunks = []
for chunk in pd.read_csv(census_csv, iterator=True, chunksize=1000):
    # Keep only the "Total" rows for sex and mother tongue at the DA level.
    keep = (
        (chunk['GEO_LEVEL'] == 4)
        & (chunk['DIM: Sex (3)'] == 'Total - Sex')
        & (chunk['DIM: Mother tongue (269)'] == 'Total - Mother tongue')
    )
    filtered_chunks.append(chunk.loc[keep, census_cols])
census_df = pd.concat(filtered_chunks)
# Shorten the verbose census column names.
census_df.columns = ['DAUID', 'English', 'French', 'English and French']
census_df.head()
Next, join the two data frames on the common key DAUID to assign geometries to the census data, and calculate the bounding box (bbox) for each geometry.
# DAUID is read as the "object" dtype; cast it to integer so the join keys match.
da_df['DAUID'] = da_df['DAUID'].astype('int')
# Attach each geometry to its census record, then rebuild as a geopandas data frame.
merged_df = gpd.GeoDataFrame(census_df.merge(da_df, on='DAUID'), geometry='geometry')
# Append the per-geometry bounding-box columns (minx, miny, maxx, maxy).
merged_df = pd.concat([merged_df, merged_df.bounds], axis=1)
merged_df.head()
The basic methodology for a dot map like this is to randomly distribute point coordinates within the administrative boundary, one to represent each value to be mapped. In this case, we will allocate a coordinate for every 100 people present in each language category.
To distribute the dots, a random location within the bounding box is calculated and then tested against the actual administrative boundary geometry. If the coordinate falls within the geometry, it is kept; if not, a new random location is calculated, and the process is repeated until a coordinate is found that does intersect.
# Convert person counts to dot counts: one dot per 100 persons,
# rounded to the nearest 100.
factor = 100
language_cols = ['English', 'French', 'English and French']
merged_df[language_cols] = (merged_df[language_cols].astype('int') / factor).round().astype('int')
# Define a function to randomly distribute coordinates within a geometry per value for each language.
def random_coordinates(row):
    """Generate random dot coordinates inside a row's geometry.

    For each language column, draws `row[language]` points via rejection
    sampling: candidate points are sampled uniformly within the row's
    bounding box (minx/miny/maxx/maxy columns) and kept only when they
    fall inside the actual geometry.

    Parameters
    ----------
    row : pandas.Series
        A merged row carrying 'English', 'French', 'English and French'
        dot counts, the 'geometry' polygon, and its bounds columns.

    Returns
    -------
    pandas.DataFrame
        One row per generated dot, with columns ('language', 'x', 'y').

    Note: requires `Point` from shapely.geometry (imported at module top);
    a degenerate/empty geometry with a nonzero count would loop forever.
    """
    results = []
    for language in ('English', 'French', 'English and French'):
        count = 0
        val = row[language]
        while count < val:
            # Sample uniformly within the bbox, keep only points inside the polygon.
            x = random.uniform(row['minx'], row['maxx'])
            y = random.uniform(row['miny'], row['maxy'])
            pt = Point(x, y)
            if pt.within(row['geometry']):
                count += 1
                results.append([language, x, y])
    return pd.DataFrame(results, columns=('language', 'x', 'y'))
# Apply the function to every row of the data frame, returning a series with a
# data frame for each row. This step also takes a while to run.
# NOTE: raw must be left at its default (False) -- random_coordinates indexes
# the row by column label ('minx', 'geometry', ...), which requires a Series;
# raw=True would pass a bare ndarray and fail on label indexing.
results = merged_df.apply(random_coordinates, axis=1)
# Unpack the series and concatenate the per-row data frames into one table of dots.
results = pd.concat(results.tolist(), ignore_index=True)
# Write the dot coordinates to file for mapping.
results.to_csv(out_csv, index=False)