{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data understanding of Titanic dataset\n",
    "\n",
    "Analyze the *titanic2* dataset.\n",
    "\n",
    "1. Use *titanic_train.csv*.\n",
    "2. Find out size of dataset, if there is a column with ID and if there are duplicated rows.\n",
    "3. Check data quality: proper types for columns, presence of missing and unexpected values.\n",
    "4. What columns can be important and unimportant for predicting the probability of surviving? Focus on them and check:\n",
    "   - validity and distribution of their values\n",
    "   - relationship of each to the `survived` column\n",
    "5. Find out companies of passengers travelling on a common ticket (more than one person). What other columns have same value for same ticket? What are the most typical cases of such companies?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "\n",
    "# classes for special types\n",
    "from pandas.api.types import CategoricalDtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Reading data\n",
    "Tit = pd.read_csv(\"titanic_train.csv\")\n",
    "Tit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.\n",
    "# size\n",
    "print(Tit.shape) # count of rows and columns\n",
    "\n",
    "# units are passengers, it seems they can be identified by passenger_id\n",
    "print('Count of unique passenger_id: ', Tit[[\"passenger_id\"]].nunique())\n",
    "# number of unique values is equal to number of rows => ok\n",
    "\n",
    "# duplicated rows without passenger_id?\n",
    "sum(Tit.duplicated(subset=['pclass', 'name', 'sex', 'age', 'sibsp', 'parch',\n",
    "       'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest',\n",
    "       'survived']))\n",
    "# no duplicated rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.\n",
    "# column names and types\n",
    "print(Tit.dtypes)\n",
    "# all types seem proper except body and boat (we expect integer if it is id number)\n",
    "# `embarked` could be ordered (the order of stops is S, C, Q)\n",
    "# `pclass` could be either ordered categorial or integer\n",
    "# `sex` could be boolean (e. g. is_male)\n",
    "# `survived` should be boolean\n",
    "\n",
    "# missing values\n",
    "# relative counts (shares):\n",
    "print(1 - Tit.count()/len(Tit))\n",
    "# - age (rather a lot => problem), fare and embarked (very little => no much problem),\n",
    "# - cabin, boat and body (high but expectable => no much problem),\n",
    "# - home.dest (around a half but maybe not important)\n",
    "# - survived available for train records only\n",
    "# some columns contain much missing values but still may be useful\n",
    "# - we can use the fact of missingness as a variable/feature (e. g. for cabin or boat)\n",
    "\n",
    "# unexpected values?\n",
    "# we need to explore each variable individually"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4.\n",
    "# for survival, the most relevant variables - intuitively - seem to be sex, age, pclass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sex\n",
    "freqtab = Tit.groupby('sex').agg(cetnost=('passenger_id', 'count'))\n",
    "freqtab[\"cetnost_rel\"] = freqtab[\"cetnost\"] / sum(freqtab[\"cetnost\"])\n",
    "freqtab\n",
    "# meets expectaction, mostly men"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pclass\n",
    "freqtab = Tit.groupby('pclass').agg(cetnost=('passenger_id', 'count'))\n",
    "freqtab[\"cetnost_rel\"] = freqtab[\"cetnost\"] / sum(freqtab[\"cetnost\"])\n",
    "freqtab\n",
    "# meets expectaction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# age\n",
    "# many missing values - for now we omit them\n",
    "# summary of statistical characteristics (by pandas)\n",
    "Tit['age'].describe()\n",
    "# for the sake of binning, thresholds 20, 30, 40 seem reasonable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# advanced statistics - scipy\n",
    "print('Mode:', stats.mode(Tit['age'].dropna()))\n",
    "# Most often age = 18\n",
    "print('Trimmed mean (5+5 % margins off):', stats.trim_mean(Tit['age'].dropna(), 0.05))\n",
    "# Trimmed mean is very near to standard - looks like no big outliers or skewness\n",
    "print('IQR:', stats.iqr(Tit['age'].dropna()))\n",
    "print('Skewness:', stats.skew(Tit['age'].dropna()))\n",
    "# almost no skewness\n",
    "print('Kurtosis:', stats.kurtosis(Tit['age'].dropna()))\n",
    "# tails similar to normal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# is age distrubuted normally?\n",
    "test_result = stats.normaltest(Tit['age'], nan_policy='omit')\n",
    "print(test_result) # rejected\n",
    "\n",
    "test_result = stats.shapiro(Tit['age'].dropna())\n",
    "print(test_result) # rejected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# survival and relationships to it\n",
    "freqtab = Tit.groupby('survived').agg(cetnost=('passenger_id', 'count'))\n",
    "freqtab[\"cetnost_rel\"] = freqtab[\"cetnost\"] / sum(freqtab[\"cetnost\"])\n",
    "print(freqtab)\n",
    "# around 2/3 dead - meets expectation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# survival vs. sex\n",
    "print('\\n', pd.crosstab(Tit[\"survived\"], Tit[\"sex\"])) # absolute\n",
    "print(pd.crosstab(Tit[\"survived\"], Tit[\"sex\"], normalize=\"columns\")) # relative by columns\n",
    "# survival rate for women and men differ very much\n",
    "\n",
    "# we can do chi-square test for contingency table (the upper one)\n",
    "test_result = stats.chi2_contingency(pd.crosstab(Tit[\"survived\"], Tit[\"sex\"]))\n",
    "print(test_result) # rejected that rows and columns may be independent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# survival vs. pclass\n",
    "print('\\n', pd.crosstab(Tit[\"survived\"], Tit[\"pclass\"])) # absolute\n",
    "print(pd.crosstab(Tit[\"survived\"], Tit[\"pclass\"], normalize=\"columns\")) # relative by columns\n",
    "# again, survival rate for classes differ\n",
    "\n",
    "test_result = stats.chi2_contingency(pd.crosstab(Tit[\"survived\"], Tit[\"pclass\"]))\n",
    "print(test_result) # rejected that rows and columns may be independent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# survival vs. age\n",
    "freqtab = Tit.groupby('survived').agg(age_mean=('age', 'mean'))\n",
    "print('\\n', freqtab)\n",
    "# it seems that older people survived a little more often than younger ones\n",
    "\n",
    "# we make a t-test - are mean ages of survived and dead the same?\n",
    "test_result = stats.ttest_ind(Tit[Tit['survived']==0]['age'], Tit[Tit['survived']==1]['age'],\n",
    "                              equal_var=False, nan_policy='omit')\n",
    "print(test_result) # accepted\n",
    "print(test_result.confidence_interval()) # conf. interval for mean difference\n",
    "# but... age of passengers may be related to class and gender, so we have to take it into account\n",
    "\n",
    "# Levene test - is age variance of survived and dead the same?\n",
    "test_result = stats.levene(Tit[Tit['survived']==0]['age'].dropna(), Tit[Tit['survived']==1]['age'].dropna())\n",
    "print(test_result) # borderly accept"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we can bin age and compare two categorial variables\n",
    "Tit['age_cat'] = pd.cut(Tit['age'], [0, 3, 10, 20, 35, 50, 1000],\n",
    "                       labels=['0-3', '3-10', '10-20', '20-35', '35-50', '50+'])\n",
    "print(pd.crosstab(Tit[\"survived\"], Tit[\"age_cat\"], normalize=\"columns\")) # relative by columns\n",
    "\n",
    "print('\\nChi square independency test:')\n",
    "test_result = stats.chi2_contingency(pd.crosstab(Tit[\"survived\"], Tit[\"age_cat\"]))\n",
    "print(test_result) # survival rate in particular age categories may be different"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. Companies on common tickets\n",
    "\n",
    "print('Count of non-missing ticket ids:', Tit[\"ticket\"].count())\n",
    "print('Count of unique ticket ids:', Tit[\"ticket\"].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# new DataFrame - ticket and person count per ticket\n",
    "Tck = Tit.groupby(\"ticket\").agg(count=(\"ticket\", \"count\"))\n",
    "Tck.reset_index(inplace=True)\n",
    "Tck = Tck.rename(columns = {'index':'ticket'})\n",
    "\n",
    "print('Count of tickets with more than 1 person:', (Tck[\"count\"] > 1).sum())\n",
    "print('Count of persons in companies:', Tck[Tck[\"count\"] > 1][\"count\"].sum())\n",
    "\n",
    "# distribution of person count on the common ticket\n",
    "print('Count of tickets by persons on the same ticket:\\n', Tck[\"count\"].value_counts())\n",
    "# majority of companies: 2 persons company"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# new DataFrame: passengers travelling in a company (sharing ticket with someone else)\n",
    "Tit2 = pd.merge(Tit, Tck[Tck[\"count\"] > 1], how=\"inner\", on=\"ticket\")\n",
    "Tit2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What other columns have same value for same ticket?\n",
    "# - fare, embarkment, ...?\n",
    "# What are the most typical cases of such companies?\n",
    "# - gender/age combinations, ..."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}