added joins

2f2c443c · Sanjay Krishnan · 8299597e · 2f2c443c
Commit 2f2c443c authored Apr 14, 2021 by Sanjay Krishnan
Showing with 1147 additions and 0 deletions
inclass/Joins in Python.ipynb
--- a/inclass/Joins in Python.ipynb
+++ b/inclass/Joins in Python.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Joins\n",
+    "A JOIN operation combines rows from two or more tables based on related data that they share. Let's go over some of the practical details of these operations.\n",
+    "\n",
+    "## Pandas Merge\n",
+    "The pandas package implements efficient \"equality\" joins. This function is called `merge` (pandas also has a `join` method, which joins on the index by default but follows the same idea). Let's work through a simple example:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>category</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            name category\n",
+       "0       John Doe        A\n",
+       "1     Jane Smith        B\n",
+       "2    Alex Taylor        A\n",
+       "3  Brett Daniels        C"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd \n",
+    "\n",
+    "table1 = [{'name': 'John Doe', 'category': 'A'}, \n",
+    "          {'name': 'Jane Smith', 'category': 'B'}, \n",
+    "          {'name': 'Alex Taylor', 'category': 'A'},\n",
+    "          {'name': 'Brett Daniels', 'category': 'C'}]\n",
+    "\n",
+    "table1_df = pd.DataFrame(table1)\n",
+    "table1_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>salary</th>\n",
+       "      <th>category</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1000</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>900</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>500</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   salary category\n",
+       "0    1000        A\n",
+       "1     900        B\n",
+       "2     500        C"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table2 = [{'salary': 1000, 'category': 'A'}, \n",
+    "          {'salary': 900, 'category': 'B'}, \n",
+    "          {'salary': 500, 'category': 'C'}]\n",
+    "table2_df = pd.DataFrame(table2)\n",
+    "table2_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>category</th>\n",
+       "      <th>salary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            name category  salary\n",
+       "0       John Doe        A    1000\n",
+       "1    Alex Taylor        A    1000\n",
+       "2     Jane Smith        B     900\n",
+       "3  Brett Daniels        C     500"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table1_df.merge(table2_df, on='category')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The merge is commutative up to column order; swapping the two tables gives the same set of rows with the columns rearranged:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>salary</th>\n",
+       "      <th>category</th>\n",
+       "      <th>name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1000</td>\n",
+       "      <td>A</td>\n",
+       "      <td>John Doe</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1000</td>\n",
+       "      <td>A</td>\n",
+       "      <td>Alex Taylor</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>900</td>\n",
+       "      <td>B</td>\n",
+       "      <td>Jane Smith</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>500</td>\n",
+       "      <td>C</td>\n",
+       "      <td>Brett Daniels</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   salary category           name\n",
+       "0    1000        A       John Doe\n",
+       "1    1000        A    Alex Taylor\n",
+       "2     900        B     Jane Smith\n",
+       "3     500        C  Brett Daniels"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table2_df.merge(table1_df, on='category')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This function merges the two tables on the `category` column and automatically removes the redundancy (only a single `category` column is left). The behavior of this function can be subtle. Suppose we change the category field to D for one of the rows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>category</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            name category\n",
+       "0       John Doe        A\n",
+       "1     Jane Smith        B\n",
+       "2    Alex Taylor        D\n",
+       "3  Brett Daniels        C"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table1 = [{'name': 'John Doe', 'category': 'A'}, \n",
+    "          {'name': 'Jane Smith', 'category': 'B'}, \n",
+    "          {'name': 'Alex Taylor', 'category': 'D'},\n",
+    "          {'name': 'Brett Daniels', 'category': 'C'}]\n",
+    "\n",
+    "table1_df = pd.DataFrame(table1)\n",
+    "table1_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>category</th>\n",
+       "      <th>salary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            name category  salary\n",
+       "0       John Doe        A    1000\n",
+       "1     Jane Smith        B     900\n",
+       "2  Brett Daniels        C     500"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table1_df.merge(table2_df, on='category')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The row gets dropped from the result! In the default mode of `merge` (an inner join), any row that doesn't have a match gets dropped. The keyword argument `how` modifies this behavior. Suppose we want to keep the rows of the left table that don't have a match:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>category</th>\n",
+       "      <th>salary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>900.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>500.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            name category  salary\n",
+       "0       John Doe        A  1000.0\n",
+       "1     Jane Smith        B   900.0\n",
+       "2    Alex Taylor        D     NaN\n",
+       "3  Brett Daniels        C   500.0"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table1_df.merge(table2_df, on='category', how='left')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It returns those rows, with the columns coming from the other table filled with null or `NaN`, depending on the data type. If you set `how` to `'right'` you'll get the same answer as before (why?)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>category</th>\n",
+       "      <th>salary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            name category  salary\n",
+       "0       John Doe        A    1000\n",
+       "1     Jane Smith        B     900\n",
+       "2  Brett Daniels        C     500"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table1_df.merge(table2_df, on='category', how='right')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## General Join in Pandas\n",
+    "Pandas has an efficient implementation for equality join problems. Let's mock up a general join algorithm (one that supports any filter condition): first form all pairs of rows, then filter the pairs with the condition. We'll ignore the efficiency problem for a bit."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name_x</th>\n",
+       "      <th>category_x</th>\n",
+       "      <th>dummy</th>\n",
+       "      <th>name_y</th>\n",
+       "      <th>category_y</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>1</td>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "      <td>1</td>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>1</td>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           name_x category_x  dummy         name_y category_y\n",
+       "0        John Doe          A      1       John Doe          A\n",
+       "1        John Doe          A      1     Jane Smith          B\n",
+       "2        John Doe          A      1    Alex Taylor          D\n",
+       "3        John Doe          A      1  Brett Daniels          C\n",
+       "4      Jane Smith          B      1       John Doe          A\n",
+       "5      Jane Smith          B      1     Jane Smith          B\n",
+       "6      Jane Smith          B      1    Alex Taylor          D\n",
+       "7      Jane Smith          B      1  Brett Daniels          C\n",
+       "8     Alex Taylor          D      1       John Doe          A\n",
+       "9     Alex Taylor          D      1     Jane Smith          B\n",
+       "10    Alex Taylor          D      1    Alex Taylor          D\n",
+       "11    Alex Taylor          D      1  Brett Daniels          C\n",
+       "12  Brett Daniels          C      1       John Doe          A\n",
+       "13  Brett Daniels          C      1     Jane Smith          B\n",
+       "14  Brett Daniels          C      1    Alex Taylor          D\n",
+       "15  Brett Daniels          C      1  Brett Daniels          C"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def all_pairs(df):\n",
+    "    new_df = df.copy() # make a copy of the data frame\n",
+    "    new_df['dummy'] = 1\n",
+    "    \n",
+    "    return new_df.merge(new_df, on='dummy')\n",
+    "\n",
+    "all_pairs(table1_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name_x</th>\n",
+       "      <th>category_x</th>\n",
+       "      <th>dummy</th>\n",
+       "      <th>name_y</th>\n",
+       "      <th>category_y</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>B</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>C</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>D</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           name_x category_x  dummy         name_y category_y\n",
+       "1        John Doe          A      1     Jane Smith          B\n",
+       "2        John Doe          A      1    Alex Taylor          D\n",
+       "3        John Doe          A      1  Brett Daniels          C\n",
+       "6      Jane Smith          B      1    Alex Taylor          D\n",
+       "7      Jane Smith          B      1  Brett Daniels          C\n",
+       "14  Brett Daniels          C      1    Alex Taylor          D"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pair_df = all_pairs(table1_df)\n",
+    "pair_df[pair_df['name_x'] >  pair_df['name_y']]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Self-Join\n",
+    "There are even scenarios when you might want to join a table with itself! Consider the following example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>rank</th>\n",
+       "      <th>salary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>John Doe</td>\n",
+       "      <td>Manager</td>\n",
+       "      <td>100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>Manager</td>\n",
+       "      <td>55</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Alex Taylor</td>\n",
+       "      <td>Employee</td>\n",
+       "      <td>32</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>Employee</td>\n",
+       "      <td>57</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            name      rank  salary\n",
+       "0       John Doe   Manager     100\n",
+       "1     Jane Smith   Manager      55\n",
+       "2    Alex Taylor  Employee      32\n",
+       "3  Brett Daniels  Employee      57"
+      ]
+     },
+     "execution_count": 80,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "emps = [{'name': 'John Doe', 'rank': 'Manager', 'salary': 100}, \n",
+    "          {'name': 'Jane Smith', 'rank': 'Manager', 'salary': 55}, \n",
+    "          {'name': 'Alex Taylor', 'rank': 'Employee', 'salary': 32},\n",
+    "          {'name': 'Brett Daniels', 'rank': 'Employee', 'salary': 57}]\n",
+    "emps_df = pd.DataFrame(emps)\n",
+    "emps_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Suppose we have the integrity constraint that no manager can earn less than an employee. A self-join lets us find the pairs of rows that violate it:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name_x</th>\n",
+       "      <th>rank_x</th>\n",
+       "      <th>salary_x</th>\n",
+       "      <th>dummy</th>\n",
+       "      <th>name_y</th>\n",
+       "      <th>rank_y</th>\n",
+       "      <th>salary_y</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Jane Smith</td>\n",
+       "      <td>Manager</td>\n",
+       "      <td>55</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Brett Daniels</td>\n",
+       "      <td>Employee</td>\n",
+       "      <td>57</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       name_x   rank_x  salary_x  dummy         name_y    rank_y  salary_y\n",
+       "7  Jane Smith  Manager        55      1  Brett Daniels  Employee        57"
+      ]
+     },
+     "execution_count": 87,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pair_df = all_pairs(emps_df)\n",
+    "pair_df[(pair_df['rank_x'] == 'Manager') &(pair_df['rank_y'] == 'Employee') & (pair_df['salary_x'] < pair_df['salary_y'])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}