Master data analysis, visualization, and machine learning with a Canadian market focus
Duration: 12-16 weeks (self-paced)
Level: Beginner to Intermediate
Prerequisites: Basic Python knowledge
import pandas as pd
import numpy as np
import requests
# Load Toronto neighbourhood data.
# `url` is the dataset's page on the Toronto Open Data portal and is kept for
# reference only; the data itself is read from a local "neighbourhoods.csv".
url = "https://open.toronto.ca/dataset/neighbourhoods/"
df = pd.read_csv("neighbourhoods.csv")

# Basic data exploration: dimensions, column names, and the first rows
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(df.head())

# Data info and statistics
df.info()  # info() writes its report to stdout itself
# describe() only *returns* a summary DataFrame; when run as a script (not in
# a notebook cell) the result must be printed or it is silently discarded.
print(df.describe())
Toronto Housing Price Analysis - Analyze real estate data from the Toronto Regional Real Estate Board (TRREB) to understand market trends across different neighbourhoods.
# Clean TTC ridership data
import pandas as pd
# Load TTC data
ttc_data = pd.read_csv("ttc_ridership.csv")

# Handle missing values: replace gaps in 'ridership' with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection: the in-place form operates on a
# temporary Series, is deprecated, and leaves ttc_data unchanged when pandas
# Copy-on-Write is enabled.
ridership_median = ttc_data['ridership'].median()
ttc_data['ridership'] = ttc_data['ridership'].fillna(ridership_median)

# Convert date column from text to datetime64
ttc_data['date'] = pd.to_datetime(ttc_data['date'])

# Remove outliers using the IQR method: keep rows whose ridership lies within
# 1.5 * IQR below the first quartile and above the third quartile (inclusive).
Q1 = ttc_data['ridership'].quantile(0.25)
Q3 = ttc_data['ridership'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
clean_data = ttc_data[
    (ttc_data['ridership'] >= lower_bound) &
    (ttc_data['ridership'] <= upper_bound)
]
Toronto Crime Data Preprocessing - Clean and prepare Toronto Police Service crime data for analysis, handling inconsistencies and missing information.
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load Toronto weather data.
# Parse 'date' while reading so both charts get a real time axis; left as
# plain strings, matplotlib treats every date as a separate category label.
weather_df = pd.read_csv("toronto_weather_2023.csv", parse_dates=['date'])

# Create temperature trend plot (static, matplotlib)
plt.figure(figsize=(12, 6))
plt.plot(weather_df['date'], weather_df['temperature'])
plt.title('Toronto Temperature Trends 2023')
plt.xlabel('Date')
# The degree sign was mis-encoded as 'Β°' in the original label.
plt.ylabel('Temperature (°C)')
plt.xticks(rotation=45)  # slanted tick labels so dates do not overlap
plt.tight_layout()
plt.show()

# Interactive plot with Plotly (same data, zoomable/hoverable)
fig = px.line(
    weather_df,
    x='date',
    y='temperature',
    title='Interactive Toronto Temperature Chart',
)
fig.show()
Toronto Air Quality Dashboard - Create interactive visualizations of Toronto's air quality data, showing pollution trends across different seasons and locations.
import scipy.stats as stats
from scipy.stats import ttest_ind
# Compare tech salaries by education level.
# Missing salaries are dropped first: ttest_ind propagates NaN by default, so
# a single NaN would make p_value NaN and silently fall through to the
# "No significant difference found" branch below.
bachelor_salaries = toronto_tech_df.loc[
    toronto_tech_df['education'] == 'Bachelor', 'salary'
].dropna()
master_salaries = toronto_tech_df.loc[
    toronto_tech_df['education'] == 'Master', 'salary'
].dropna()

# Perform independent two-sample t-test.
# NOTE(review): the default assumes equal variances in both groups; consider
# equal_var=False (Welch's test) if that assumption is doubtful.
t_stat, p_value = ttest_ind(bachelor_salaries, master_salaries)
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
if p_value < 0.05:
    print("Significant difference in salaries")
else:
    print("No significant difference found")

# Correlation analysis: pairwise correlations between the three columns,
# drawn as an annotated heatmap.
correlation_matrix = toronto_tech_df[
    ['experience_years', 'salary', 'company_size']
].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Toronto Tech Career Correlation Matrix')
plt.show()
Toronto Transit Efficiency Study - Analyze TTC performance data to identify patterns and make data-driven recommendations for service improvements.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
# Prepare Toronto housing data: four predictor columns and the price target
features = ['bedrooms', 'bathrooms', 'sq_ft', 'neighbourhood_score']
X = housing_df[features]
y = housing_df['price']

# Split the data: 80% train / 20% test, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Random Forest model (100 trees, fixed seed)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the held-out test set
predictions = rf_model.predict(X_test)

# Evaluate model
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"Mean Absolute Error: ${mae:,.2f}")
# The superscript two was mis-encoded as 'Β²' in the original label.
print(f"R² Score: {r2:.4f}")

# Feature importance, most influential feature first
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)
Toronto Tech Job Matching System - Build a machine learning model that matches developers with suitable job opportunities based on skills, experience, and preferences.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Simple feed-forward regression network for traffic prediction, assembled
# layer by layer: 8 inputs -> Dense(64) -> Dense(32) -> one linear output,
# with 30% dropout after each hidden layer.
model = keras.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(8,)))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(1, activation='linear'))

# Compile: Adam optimizer, mean-squared-error loss, MAE reported as a metric
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Fit on the Toronto traffic training split, validating on the held-out
# validation split after every epoch.
# Features: hour, day_of_week, weather, events, etc.
history = model.fit(
    X_train_traffic,
    y_train_traffic,
    validation_data=(X_val_traffic, y_val_traffic),
    epochs=100,
    batch_size=32,
    verbose=1,
)

# Score the trained network on the test split and report its MAE
test_loss, test_mae = model.evaluate(X_test_traffic, y_test_traffic)
print(f"Test MAE: {test_mae:.2f} minutes")
Toronto Social Media Sentiment Analysis - Build a deep learning model to analyze sentiment about Toronto tech companies from social media data.
import requests
import json
from bs4 import BeautifulSoup
# Access Toronto Open Data API
def get_toronto_data(dataset_id):
    """Fetch CKAN package metadata for a Toronto Open Data dataset.

    Args:
        dataset_id: Package id passed to the CKAN ``package_show`` action.

    Returns:
        The decoded JSON response on HTTP 200; ``None`` on any failure
        (non-200 status, network error or timeout, or a non-JSON body).
    """
    # NOTE(review): the host name contains "sandbox" — confirm this is the
    # intended endpoint rather than the portal's production API host.
    base_url = "https://ckan0.cf.opendata.inter.sandbox-toronto.ca/api/3/"
    try:
        response = requests.get(
            base_url + "action/package_show",
            # Let requests URL-encode the id instead of formatting it into
            # the query string by hand.
            params={"id": dataset_id},
            # Without a timeout a stalled server blocks the caller forever.
            timeout=30,
        )
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON (requests' JSON error subclasses ValueError).
        return None
# Look up the metadata for the parking ticket dataset
parking_data = get_toronto_data("parking-tickets")

# Load the first CSV resource listed in the package, if the lookup succeeded
if parking_data:
    resources = parking_data['result']['resources']
    for resource in resources:
        # Skip every resource that is not a CSV (format compared
        # case-insensitively)
        if resource['format'].upper() != 'CSV':
            continue
        csv_url = resource['url']
        df = pd.read_csv(csv_url)
        print(f"Loaded {len(df)} parking ticket records")
        # Only the first CSV resource is wanted
        break
# Web scraping example - Toronto tech job postings
def _child_text(parent, tag_name, css_class):
    """Return the stripped text of the first matching child tag, or None."""
    tag = parent.find(tag_name, class_=css_class)
    return tag.text.strip() if tag is not None else None


def scrape_tech_jobs():
    """Scrape job title, company and salary from the example job board.

    Returns:
        A DataFrame with one row per ``div.job-listing`` element and the
        columns ``title``, ``company`` and ``salary`` (a field is ``None``
        when its tag is missing from a listing), or ``None`` when the page
        does not answer with HTTP 200.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; DataScience-Student)'
    }
    # Note: Always check robots.txt and respect website policies
    url = "https://example-job-site.com/toronto-tech-jobs"
    # A timeout keeps a stalled server from hanging the scraper indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # Listings often omit a field (salary in particular); _child_text yields
    # None for those instead of raising AttributeError on a missing tag.
    job_data = [
        {
            'title': _child_text(job, 'h3', 'job-title'),
            'company': _child_text(job, 'span', 'company'),
            'salary': _child_text(job, 'span', 'salary'),
        }
        for job in soup.find_all('div', class_='job-listing')
    ]
    return pd.DataFrame(job_data)
Canadian Tech Market Intelligence Dashboard - Create an automated system that collects and analyzes job postings, salary trends, and skill demands in Canada's tech market.
from flask import Flask, request, jsonify
import joblib
import pandas as pd
app = Flask(__name__)

# Restore the trained model and the fitted feature scaler from disk, in that
# order.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code —
# only load artifact files that come from a trusted source.
model, scaler = (
    joblib.load(artifact_path)
    for artifact_path in ('toronto_housing_model.pkl', 'feature_scaler.pkl')
)
@app.route('/predict', methods=['POST'])
def predict_price():
    """Predict a housing price from the JSON feature object in the request.

    The request body must be a JSON object mapping feature names to values;
    it becomes a one-row DataFrame, is scaled with ``scaler`` and passed to
    ``model``.

    Returns:
        On success, a JSON body with ``predicted_price`` (rounded to two
        decimals), ``currency`` and ``status`` (HTTP 200). On failure, a JSON
        body with ``error`` and ``status: 'error'`` and HTTP 400 for
        malformed input or HTTP 500 for any other error, so clients can rely
        on the status code instead of always receiving 200.
    """
    # silent=True returns None for a missing or invalid JSON body instead of
    # raising, so the problem can be reported in this endpoint's own format.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({
            'error': 'Request body must be a JSON object of feature values',
            'status': 'error'
        }), 400

    try:
        # Create a one-row DataFrame, scale it, and predict
        features = pd.DataFrame([data])
        features_scaled = scaler.transform(features)
        prediction = model.predict(features_scaled)[0]
    except (ValueError, KeyError, TypeError) as e:
        # Wrong, missing or non-numeric features supplied by the client
        return jsonify({
            'error': str(e),
            'status': 'error'
        }), 400
    except Exception as e:
        # Top-level boundary: report unexpected failures as a server error
        return jsonify({
            'error': str(e),
            'status': 'error'
        }), 500

    return jsonify({
        # float() turns a numpy scalar into a plain, JSON-serializable float
        'predicted_price': round(float(prediction), 2),
        'currency': 'CAD',
        'status': 'success'
    })
if __name__ == '__main__':
    import os

    # Debug mode enables the interactive Werkzeug debugger, which allows
    # arbitrary code execution from the browser; keep it off unless it is
    # explicitly requested through the FLASK_DEBUG environment variable.
    app.run(debug=os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true'))
Toronto Smart City Analytics Platform - Develop a comprehensive data science platform that integrates multiple Toronto datasets (traffic, weather, events, demographics) to provide insights for city planning and business decisions.
GitHub Essentials: Showcase 3-5 projects demonstrating different aspects of data science, with special emphasis on Canadian datasets and business contexts.
Networking Opportunities: Toronto Machine Learning Society (TMLS), PyData Toronto, Data Science Toronto Meetup, and Toronto AI conferences.
Join thousands of Toronto developers who are building successful data science careers with our comprehensive tutorials and community support.