{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "cffac1a7-6de2-4d90-8e04-65fa3b151ed7", "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "from sklearn.manifold import TSNE\n", "from sklearn.decomposition import PCA\n", "\n", "# Kelly = ['#af2337', '#ecc342', '#2967a0', '#2f3c28', '#96b437',\n", "# '#da93ab', '#e58932', '#80598f', '#7e331f', '#3b855a',\n", "# '#c0b286', '#a9c9ed', '#ec977f', '#848482', '#604628',\n", "# '#d26034', '#a64c6b', '#dbd245', '#eba83b', '#5d5092',\n", "# '#222222', '#f2f3f4']" ] }, { "cell_type": "code", "execution_count": 2, "id": "b5882de2-f1c9-432b-883b-1b2f285a91fa", "metadata": {}, "outputs": [], "source": [ "# adjust the default matplotlib plotting style\n", "\n", "mpl.rcParams['figure.facecolor'] = 'white'\n", "mpl.rcParams['axes.facecolor'] = 'EAEAF2'\n", "mpl.rcParams['axes.titlesize'] = 19\n", "mpl.rcParams['axes.labelsize'] = 16\n", "mpl.rcParams['legend.fontsize'] = 13\n", "mpl.rcParams['legend.markerscale'] = 1\n", "mpl.rcParams['xtick.labelsize'] = 14\n", "mpl.rcParams['ytick.labelsize'] = 14\n", "mpl.rcParams['figure.dpi'] = 100" ] }, { "cell_type": "code", "execution_count": 3, "id": "17c26338-cf66-4497-b608-ffb818f7f781", "metadata": {}, "outputs": [], "source": [ "# load the genotype data\n", "phase3_genotype = pd.read_csv('1kg_phase3_MGI358_genotype_matrix.tsv', sep = '\\t', index_col = 0)\n", "\n", "# load the metadata\n", "# there are some empty columns at the end of each line, which is why usecols=[0,1,2,3]\n", "phase3_info = pd.read_csv('1kg_phase3_release_20130502_vcf/integrated_call_samples_v3.20130502.ALL.panel',\n", " sep = '\\t', index_col = 0, usecols=[0,1,2,3])\n", "\n", "# make sure the sample order is the same in phase3_genotype and phase3_info\n", "phase3_info = 
phase3_info.loc[phase3_genotype.columns, :]" ] }, { "cell_type": "code", "execution_count": 4, "id": "ebda4582-1e23-4e26-8ef7-2c27a1d41dcb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | HG00096 | \n", "HG00097 | \n", "HG00099 | \n", "HG00100 | \n", "HG00101 | \n", "HG00102 | \n", "HG00103 | \n", "HG00105 | \n", "HG00106 | \n", "HG00107 | \n", "... | \n", "NA21128 | \n", "NA21129 | \n", "NA21130 | \n", "NA21133 | \n", "NA21135 | \n", "NA21137 | \n", "NA21141 | \n", "NA21142 | \n", "NA21143 | \n", "NA21144 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ID | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
rs826472 | \n", "2 | \n", "2 | \n", "0 | \n", "1 | \n", "0 | \n", "2 | \n", "0 | \n", "1 | \n", "2 | \n", "1 | \n", "... | \n", "2 | \n", "2 | \n", "1 | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "1 | \n", "2 | \n", "1 | \n", "
rs735155 | \n", "0 | \n", "2 | \n", "0 | \n", "0 | \n", "0 | \n", "2 | \n", "1 | \n", "0 | \n", "0 | \n", "2 | \n", "... | \n", "1 | \n", "2 | \n", "2 | \n", "2 | \n", "1 | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "0 | \n", "
2 rows × 2504 columns
\n", "\n", " | pop | \n", "super_pop | \n", "gender | \n", "
---|---|---|---|
HG00096 | \n", "GBR | \n", "EUR | \n", "male | \n", "
HG00097 | \n", "GBR | \n", "EUR | \n", "female | \n", "