{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data understanding of Titanic dataset\n", "\n", "Analyze the *titanic2* dataset.\n", "\n", "1. Use *titanic_train.csv*.\n", "2. Find out size of dataset, if there is a column with ID and if there are duplicated rows.\n", "3. Check data quality: proper types for columns, presence of missing and unexpected values.\n", "4. What columns can be important and unimportant for predicting the probability of surviving? Focus on them and check:\n", " - validity and distribution of their values\n", " - relationship of each to the `survived` column\n", "5. Find out companies of passengers travelling on a common ticket (more than one person). What other columns have same value for same ticket? What are the most typical cases of such companies?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Import libraries\n", "import pandas as pd\n", "import numpy as np\n", "from scipy import stats\n", "\n", "# classes for special types\n", "from pandas.api.types import CategoricalDtype" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 1. Reading data\n", "Tit = pd.read_csv(\"titanic_train.csv\")\n", "Tit" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 2.\n", "# size\n", "print(Tit.shape) # count of rows and columns\n", "\n", "# units are passengers, it seems they can be identified by passenger_id\n", "print('Count of unique passenger_id: ', Tit[[\"passenger_id\"]].nunique())\n", "# number of unique values is equal to number of rows => ok\n", "\n", "# duplicated rows without passenger_id?\n", "sum(Tit.duplicated(subset=['pclass', 'name', 'sex', 'age', 'sibsp', 'parch',\n", " 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest',\n", " 'survived']))\n", "# no duplicated rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 3.\n", "# column names and types\n", "print(Tit.dtypes)\n", "# all types seem proper except body and boat (we expect integer if it is id number)\n", "# `embarked` could be ordered (the order of stops is S, C, Q)\n", "# `pclass` could be either ordered categorial or integer\n", "# `sex` could be boolean (e. g. is_male)\n", "# `survived` should be boolean\n", "\n", "# missing values\n", "# relative counts (shares):\n", "print(1 - Tit.count()/len(Tit))\n", "# - age (rather a lot => problem), fare and embarked (very little => no much problem),\n", "# - cabin, boat and body (high but expectable => no much problem),\n", "# - home.dest (around a half but maybe not important)\n", "# - survived available for train records only\n", "# some columns contain much missing values but still may be useful\n", "# - we can use the fact of missingness as a variable/feature (e. g. for cabin or boat)\n", "\n", "# unexpected values?\n", "# we need to explore each variable individually" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 4.\n", "# for survival, the most relevant variables - intuitively - seem to be sex, age, pclass" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sex\n", "freqtab = Tit.groupby('sex').agg(cetnost=('passenger_id', 'count'))\n", "freqtab[\"cetnost_rel\"] = freqtab[\"cetnost\"] / sum(freqtab[\"cetnost\"])\n", "freqtab\n", "# meets expectaction, mostly men" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pclass\n", "freqtab = Tit.groupby('pclass').agg(cetnost=('passenger_id', 'count'))\n", "freqtab[\"cetnost_rel\"] = freqtab[\"cetnost\"] / sum(freqtab[\"cetnost\"])\n", "freqtab\n", "# meets expectaction" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# age\n", "# many missing values - for now we omit them\n", "# summary of statistical characteristics (by pandas)\n", "Tit['age'].describe()\n", "# for the sake of binning, thresholds 20, 30, 40 seem reasonable" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# advanced statistics - scipy\n", "print('Mode:', stats.mode(Tit['age'].dropna()))\n", "# Most often age = 18\n", "print('Trimmed mean (5+5 % margins off):', stats.trim_mean(Tit['age'].dropna(), 0.05))\n", "# Trimmed mean is very near to standard - looks like no big outliers or skewness\n", "print('IQR:', stats.iqr(Tit['age'].dropna()))\n", "print('Skewness:', stats.skew(Tit['age'].dropna()))\n", "# almost no skewness\n", "print('Kurtosis:', stats.kurtosis(Tit['age'].dropna()))\n", "# tails similar to normal" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# is age distrubuted normally?\n", "test_result = stats.normaltest(Tit['age'], nan_policy='omit')\n", "print(test_result) # rejected\n", "\n", "test_result = stats.shapiro(Tit['age'].dropna())\n", "print(test_result) # rejected" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# survival and relationships to it\n", "freqtab = Tit.groupby('survived').agg(cetnost=('passenger_id', 'count'))\n", "freqtab[\"cetnost_rel\"] = freqtab[\"cetnost\"] / sum(freqtab[\"cetnost\"])\n", "print(freqtab)\n", "# around 2/3 dead - meets expectation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# survival vs. sex\n", "print('\\n', pd.crosstab(Tit[\"survived\"], Tit[\"sex\"])) # absolute\n", "print(pd.crosstab(Tit[\"survived\"], Tit[\"sex\"], normalize=\"columns\")) # relative by columns\n", "# survival rate for women and men differ very much\n", "\n", "# we can do chi-square test for contingency table (the upper one)\n", "test_result = stats.chi2_contingency(pd.crosstab(Tit[\"survived\"], Tit[\"sex\"]))\n", "print(test_result) # rejected that rows and columns may be independent" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# survival vs. pclass\n", "print('\\n', pd.crosstab(Tit[\"survived\"], Tit[\"pclass\"])) # absolute\n", "print(pd.crosstab(Tit[\"survived\"], Tit[\"pclass\"], normalize=\"columns\")) # relative by columns\n", "# again, survival rate for classes differ\n", "\n", "test_result = stats.chi2_contingency(pd.crosstab(Tit[\"survived\"], Tit[\"pclass\"]))\n", "print(test_result) # rejected that rows and columns may be independent" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# survival vs. age\n", "freqtab = Tit.groupby('survived').agg(age_mean=('age', 'mean'))\n", "print('\\n', freqtab)\n", "# it seems that older people survived a little more often than younger ones\n", "\n", "# we make a t-test - are mean ages of survived and dead the same?\n", "test_result = stats.ttest_ind(Tit[Tit['survived']==0]['age'], Tit[Tit['survived']==1]['age'],\n", " equal_var=False, nan_policy='omit')\n", "print(test_result) # accepted\n", "print(test_result.confidence_interval()) # conf. interval for mean difference\n", "# but... age of passengers may be related to class and gender, so we have to take it into account\n", "\n", "# Levene test - is age variance of survived and dead the same?\n", "test_result = stats.levene(Tit[Tit['survived']==0]['age'].dropna(), Tit[Tit['survived']==1]['age'].dropna())\n", "print(test_result) # borderly accept" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# we can bin age and compare two categorial variables\n", "Tit['age_cat'] = pd.cut(Tit['age'], [0, 3, 10, 20, 35, 50, 1000],\n", " labels=['0-3', '3-10', '10-20', '20-35', '35-50', '50+'])\n", "print(pd.crosstab(Tit[\"survived\"], Tit[\"age_cat\"], normalize=\"columns\")) # relative by columns\n", "\n", "print('\\nChi square independency test:')\n", "test_result = stats.chi2_contingency(pd.crosstab(Tit[\"survived\"], Tit[\"age_cat\"]))\n", "print(test_result) # survival rate in particular age categories may be different" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 5. Companies on common tickets\n", "\n", "print('Count of non-missing ticket ids:', Tit[\"ticket\"].count())\n", "print('Count of unique ticket ids:', Tit[\"ticket\"].nunique())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# new DataFrame - ticket and person count per ticket\n", "Tck = Tit.groupby(\"ticket\").agg(count=(\"ticket\", \"count\"))\n", "Tck.reset_index(inplace=True)\n", "Tck = Tck.rename(columns = {'index':'ticket'})\n", "\n", "print('Count of tickets with more than 1 person:', (Tck[\"count\"] > 1).sum())\n", "print('Count of persons in companies:', Tck[Tck[\"count\"] > 1][\"count\"].sum())\n", "\n", "# distribution of person count on the common ticket\n", "print('Count of tickets by persons on the same ticket:\\n', Tck[\"count\"].value_counts())\n", "# majority of companies: 2 persons company" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# new DataFrame: passengers travelling in a company (sharing ticket with someone else)\n", "Tit2 = pd.merge(Tit, Tck[Tck[\"count\"] > 1], how=\"inner\", on=\"ticket\")\n", "Tit2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What other columns have same value for same ticket?\n", "# - fare, embarkment, ...?\n", "# What are the most typical cases of such companies?\n", "# - gender/age combinations, ..." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" } }, "nbformat": 4, "nbformat_minor": 1 }