First, read the input tables as pandas data frames, and filter out any unnecessary columns and rows.

In [1]:
# Import required modules.
import random

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

pd.set_option('display.max_columns', None)

# Input/output file paths — edit these to match your environment.
dissemination_area_lyr = r'/home/jovyan/work/data/census/lda_000b16a_e/lda_000b16a_e.shp'
census_csv = r'/home/jovyan/work/data/census/98-400-X2016055_ENG_CSV/98-400-X2016055_English_CSV_data.csv'
out_csv = r'/home/jovyan/work/blog/language-dot-map/language-dots-100.csv'

# Load the dissemination-area polygons, keeping only the DA identifier and
# its geometry; every other attribute is unused downstream.
da_df = gpd.read_file(dissemination_area_lyr)[['DAUID', 'geometry']]
da_df.head()
Out[1]:
DAUID geometry
0 10010244 POLYGON ((8976851.149 2149576.543, 8976818.149...
1 10010245 POLYGON ((8977202.180 2150836.794, 8977136.277...
2 10010246 POLYGON ((8977549.383 2150892.566, 8977492.269...
3 10010247 POLYGON ((8977682.314 2151083.183, 8977689.440...
4 10010248 POLYGON ((8978152.474 2151142.586, 8978040.654...
In [2]:
# Read the language census table in 1,000-row chunks, keeping only the
# dissemination-area totals: GEO_LEVEL 4, both sexes combined, and the
# "Total - Mother tongue" rollup. May take a while to complete.
census_cols = ['GEO_CODE (POR)', 'Dim: Knowledge of official languages (5): Member ID: [2]: English only', 'Dim: Knowledge of official languages (5): Member ID: [3]: French only', 'Dim: Knowledge of official languages (5): Member ID: [4]: English and French']

filtered_chunks = []
for chunk in pd.read_csv(census_csv, iterator=True, chunksize=1000):
    keep = (
        (chunk['GEO_LEVEL'] == 4)
        & (chunk['DIM: Sex (3)'] == 'Total - Sex')
        & (chunk['DIM: Mother tongue (269)'] == 'Total - Mother tongue')
    )
    filtered_chunks.append(chunk.loc[keep, census_cols])
census_df = pd.concat(filtered_chunks)

# Replace the verbose census column headers with short names.
census_df.columns = ['DAUID', 'English', 'French', 'English and French']
census_df.head()
Out[2]:
DAUID English French English and French
3228 10010734 150 0 0
4842 10010735 350 0 5
5649 10010736 130 0 0
7263 10010733 65 0 0
8877 10010737 360 0 5

Next, join the two data frames by the common key DAUID to assign geometries to the census data and calculate the bounding box (bbox) for each geometry.

In [3]:
# The shapefile delivers DAUID as strings ("object" dtype); cast to int so it
# matches the integer IDs coming from the census CSV before merging.
da_df['DAUID'] = da_df['DAUID'].astype('int')

# Attach each DA's polygon to its census record, and promote the result to a
# GeoDataFrame so spatial operations (bounds, point-in-polygon) are available.
merged_df = gpd.GeoDataFrame(census_df.merge(da_df, on='DAUID'), geometry='geometry')

# Append each geometry's bounding box as minx/miny/maxx/maxy columns.
merged_df = pd.concat([merged_df, merged_df.bounds], axis=1)
merged_df.head()
Out[3]:
DAUID English French English and French geometry minx miny maxx maxy
0 10010734 150 0 0 POLYGON ((9001157.471 2050664.529, 9001538.834... 9.000284e+06 2.049439e+06 9.001539e+06 2.051208e+06
1 10010735 350 0 5 POLYGON ((8991279.169 2051342.151, 8991366.254... 8.989193e+06 2.041993e+06 8.997097e+06 2.051519e+06
2 10010736 130 0 0 POLYGON ((8992311.551 2054468.074, 8993996.040... 8.985925e+06 2.042707e+06 8.995211e+06 2.054494e+06
3 10010733 65 0 0 POLYGON ((8985241.431 2028561.543, 8984900.929... 8.984089e+06 2.028226e+06 8.985462e+06 2.029444e+06
4 10010737 360 0 5 MULTIPOLYGON (((9014579.246 2070080.857, 90145... 8.995169e+06 2.066037e+06 9.015155e+06 2.107619e+06

The basic methodology for a dot map like this is to randomly distribute point coordinates within the administration boundary, one to represent each value to be mapped. In this case, we will allocate a coordinate for every 100 people present in each language category.

To distribute the dots, a random location within the bounding box is calculated and then tested against the actual administration boundary geometry. If the coordinate falls within the geometry, it is kept; if not, a new random location is calculated, and the process is repeated until a coordinate that does intersect is found.

In [4]:
# Convert absolute population counts to dot counts: one dot per 100 people,
# rounded to the nearest whole dot.
factor = 100
lang_cols = ['English', 'French', 'English and French']
merged_df[lang_cols] = merged_df[lang_cols].astype('int')
merged_df[lang_cols] = (merged_df[lang_cols] / factor).round().astype('int')

# Define a function that scatters one random point inside the DA geometry for
# each dot counted in every language category.
def random_coordinates(row):
    """Generate random dot coordinates inside a row's geometry.

    For each language column, draws ``row[language]`` points by rejection
    sampling: a uniform random location within the geometry's bounding box
    is kept only if it falls inside the geometry itself.

    Parameters:
        row: a row of ``merged_df`` providing 'geometry', the bounds columns
            'minx'/'miny'/'maxx'/'maxy', and integer dot counts in the
            'English', 'French' and 'English and French' columns.

    Returns:
        pd.DataFrame: one record per dot, with columns ('language', 'x', 'y').
    """
    results = []
    for language in ('English', 'French', 'English and French'):
        count = 0
        val = row[language]
        # Rejection sampling: keep drawing until `val` samples land inside
        # the geometry. NOTE(review): a degenerate or empty geometry with
        # val > 0 would loop forever — assumed not to occur in this dataset.
        while count < val:
            x = random.uniform(row['minx'], row['maxx'])
            y = random.uniform(row['miny'], row['maxy'])
            pt = Point(x, y)
            if pt.within(row['geometry']):
                count += 1
                results.append([language, x, y])
    return pd.DataFrame(results, columns=('language', 'x', 'y'))

# Apply the function to every row, returning a Series holding one DataFrame
# per row. raw must stay False (the default) so each row is passed as a
# labelled Series: random_coordinates indexes by column name ('minx',
# 'geometry', ...), which fails on the bare ndarrays that raw=True supplies.
# This step also takes a while to run.
results = merged_df.apply(random_coordinates, axis=1)

# Unpack the Series of per-row DataFrames and stack them into a single table.
results = pd.concat(results.tolist(), ignore_index=True)

# Write the dot coordinates to file for mapping.
results.to_csv(out_csv, index=False)