First cut at adding a data augmentation Python notebook
Browse files- augment_data.ipynb +376 -0
- train-data/sql_train.tsv +3 -3
    	
        augment_data.ipynb
    ADDED
    
    | @@ -0,0 +1,376 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
             "cells": [
         | 
| 3 | 
            +
              {
         | 
| 4 | 
            +
               "cell_type": "markdown",
         | 
| 5 | 
            +
               "metadata": {},
         | 
| 6 | 
            +
               "source": [
         | 
| 7 | 
            +
                "# Augment data from TSV files to change team names and years"
         | 
| 8 | 
            +
               ]
         | 
| 9 | 
            +
              },
         | 
| 10 | 
            +
              {
         | 
| 11 | 
            +
               "cell_type": "markdown",
         | 
| 12 | 
            +
               "metadata": {},
         | 
| 13 | 
            +
               "source": [
         | 
| 14 | 
            +
                "## Create dictionary for mapping team names and abbreviations"
         | 
| 15 | 
            +
               ]
         | 
| 16 | 
            +
              },
         | 
| 17 | 
            +
              {
         | 
| 18 | 
            +
               "cell_type": "code",
         | 
| 19 | 
            +
               "execution_count": 3,
         | 
| 20 | 
            +
               "metadata": {},
         | 
| 21 | 
            +
               "outputs": [
         | 
| 22 | 
            +
                {
         | 
| 23 | 
            +
                 "name": "stdout",
         | 
| 24 | 
            +
                 "output_type": "stream",
         | 
| 25 | 
            +
                 "text": [
         | 
| 26 | 
            +
                  "30\n",
         | 
| 27 | 
            +
                  "30\n",
         | 
| 28 | 
            +
                  "30\n",
         | 
| 29 | 
            +
                  "30\n",
         | 
| 30 | 
            +
                  "0,Atlanta Hawks,0,ATL,0\n",
         | 
| 31 | 
            +
                  "1,Boston Celtics,1,BOS,1\n",
         | 
| 32 | 
            +
                  "2,Cleveland Cavaliers,2,CLE,2\n",
         | 
| 33 | 
            +
                  "3,New Orleans Pelicans,3,NOP,3\n",
         | 
| 34 | 
            +
                  "4,Chicago Bulls,4,CHI,4\n",
         | 
| 35 | 
            +
                  "5,Dallas Mavericks,5,DAL,5\n",
         | 
| 36 | 
            +
                  "6,Denver Nuggets,6,DEN,6\n",
         | 
| 37 | 
            +
                  "7,Golden State Warriors,7,GSW,7\n",
         | 
| 38 | 
            +
                  "8,Houston Rockets,8,HOU,8\n",
         | 
| 39 | 
            +
                  "9,Los Angeles Clippers,9,LAC,9\n",
         | 
| 40 | 
            +
                  "10,Los Angeles Lakers,10,LAL,10\n",
         | 
| 41 | 
            +
                  "11,Miami Heat,11,MIA,11\n",
         | 
| 42 | 
            +
                  "12,Milwaukee Bucks,12,MIL,12\n",
         | 
| 43 | 
            +
                  "13,Minnesota Timberwolves,13,MIN,13\n",
         | 
| 44 | 
            +
                  "14,Brooklyn Nets,14,BKN,14\n",
         | 
| 45 | 
            +
                  "15,New York Knicks,15,NYK,15\n",
         | 
| 46 | 
            +
                  "16,Orlando Magic,16,ORL,16\n",
         | 
| 47 | 
            +
                  "17,Indiana Pacers,17,IND,17\n",
         | 
| 48 | 
            +
                  "18,Philadelphia 76ers,18,PHI,18\n",
         | 
| 49 | 
            +
                  "19,Phoenix Suns,19,PHX,19\n",
         | 
| 50 | 
            +
                  "20,Portland Trail Blazers,20,POR,20\n",
         | 
| 51 | 
            +
                  "21,Sacramento Kings,21,SAC,21\n",
         | 
| 52 | 
            +
                  "22,San Antonio Spurs,22,SAS,22\n",
         | 
| 53 | 
            +
                  "23,Oklahoma City Thunder,23,OKC,23\n",
         | 
| 54 | 
            +
                  "24,Toronto Raptors,24,TOR,24\n",
         | 
| 55 | 
            +
                  "25,Utah Jazz,25,UTA,25\n",
         | 
| 56 | 
            +
                  "26,Memphis Grizzlies,26,MEM,26\n",
         | 
| 57 | 
            +
                  "27,Washington Wizards,27,WAS,27\n",
         | 
| 58 | 
            +
                  "28,Detroit Pistons,28,DET,28\n",
         | 
| 59 | 
            +
                  "29,Charlotte Hornets,29,CHA,29\n"
         | 
| 60 | 
            +
                 ]
         | 
| 61 | 
            +
                }
         | 
| 62 | 
            +
               ],
         | 
| 63 | 
            +
               "source": [
         | 
# Canonical team ordering. A team's position in this list is its integer index,
# and abbreviation_array below must list the same teams in the same order.
team_array = [
    "Atlanta Hawks",
    "Boston Celtics",
    "Cleveland Cavaliers",
    "New Orleans Pelicans",
    "Chicago Bulls",
    "Dallas Mavericks",
    "Denver Nuggets",
    "Golden State Warriors",
    "Houston Rockets",
    "Los Angeles Clippers",
    "Los Angeles Lakers",
    "Miami Heat",
    "Milwaukee Bucks",
    "Minnesota Timberwolves",
    "Brooklyn Nets",
    "New York Knicks",
    "Orlando Magic",
    "Indiana Pacers",
    "Philadelphia 76ers",
    "Phoenix Suns",
    "Portland Trail Blazers",
    "Sacramento Kings",
    "San Antonio Spurs",
    "Oklahoma City Thunder",
    "Toronto Raptors",
    "Utah Jazz",
    "Memphis Grizzlies",
    "Washington Wizards",
    "Detroit Pistons",
    "Charlotte Hornets",
]

# Team name -> index. Derived from team_array instead of being typed out a
# second time, so the list and the dictionary cannot drift apart.
team_map = {name: index for index, name in enumerate(team_array)}

# Check that array and dictionary are aligned properly. With a derived map the
# only way this can fire is a duplicated team name (the later index wins).
for i in range(len(team_array)):
    if i != team_map[team_array[i]]:
        print("Invalid!")

# Team abbreviations, in the same order as team_array.
abbreviation_array = [
    "ATL",
    "BOS",
    "CLE",
    "NOP",
    "CHI",
    "DAL",
    "DEN",
    "GSW",
    "HOU",
    "LAC",
    "LAL",
    "MIA",
    "MIL",
    "MIN",
    "BKN",
    "NYK",
    "ORL",
    "IND",
    "PHI",
    "PHX",
    "POR",
    "SAC",
    "SAS",
    "OKC",
    "TOR",
    "UTA",
    "MEM",
    "WAS",
    "DET",
    "CHA",
]

# Abbreviation -> index, built the same way as team_map.
abbreviation_map = {abbr: index for index, abbr in enumerate(abbreviation_array)}

# All four containers should report the same size.
print(len(team_array))
print(len(team_map))
print(len(abbreviation_array))
print(len(abbreviation_map))

# Check that team names, abbreviation and index maps all line up
for i in range(len(team_array)):
    print(f"{i},{team_array[i]},{team_map[team_array[i]]},{abbreviation_array[i]},{abbreviation_map[abbreviation_array[i]]}")
| 181 | 
            +
               ]
         | 
| 182 | 
            +
              },
         | 
| 183 | 
            +
              {
         | 
| 184 | 
            +
               "cell_type": "markdown",
         | 
| 185 | 
            +
               "metadata": {},
         | 
| 186 | 
            +
               "source": [
         | 
| 187 | 
            +
                "## Create function to augment data by updating team names in SQL queries"
         | 
| 188 | 
            +
               ]
         | 
| 189 | 
            +
              },
         | 
| 190 | 
            +
              {
         | 
| 191 | 
            +
               "cell_type": "code",
         | 
| 192 | 
            +
               "execution_count": 44,
         | 
| 193 | 
            +
               "metadata": {},
         | 
| 194 | 
            +
               "outputs": [],
         | 
| 195 | 
            +
               "source": [
         | 
| 196 | 
            +
                "import random\n",
         | 
| 197 | 
            +
                "import pandas as pd\n",
         | 
| 198 | 
            +
                "import sqlite3 as sql\n",
         | 
| 199 | 
            +
                "\n",
         | 
# Find team names in the sample
def find_teams_in_sample(sample, team_list):
    """Return the indices of the entries of ``team_list`` that occur in ``sample``.

    Matching uses Python's ``in`` operator, so for a string sample it is a
    case-sensitive substring test.

    Args:
        sample: Text to search. ``None`` or a float (which is how pandas
            represents a missing text cell) is treated as containing no teams.
        team_list: Sequence of team names to look for.

    Returns:
        A list of integer positions into ``team_list``, in ascending order.
        Empty when nothing matches or the sample is missing.
    """
    # A missing cell does not support `in`; report "no teams" instead of
    # raising TypeError in the middle of a dataframe pass.
    if sample is None or isinstance(sample, float):
        return []
    return [index for index, team in enumerate(team_list) if team in sample]
| 207 | 
            +
                "\n",
         | 
# Get random number excluding the one already used
def get_random_excluding(floor, ceiling, excluded_number):
    """Draw a random integer in ``[floor, ceiling]`` that differs from ``excluded_number``.

    Args:
        floor: Lowest value that may be returned (inclusive).
        ceiling: Highest value that may be returned (inclusive).
        excluded_number: Value that must not be returned.

    Returns:
        An integer ``n`` with ``floor <= n <= ceiling`` and ``n != excluded_number``.

    Raises:
        ValueError: If the range holds only ``excluded_number``, so no valid
            value exists. ``random.randint`` also raises ``ValueError`` for an
            empty range (``floor > ceiling``).
    """
    # Without this guard the retry loop below would never terminate.
    if floor == ceiling == excluded_number:
        raise ValueError(
            "range [%r, %r] contains no value other than the excluded number"
            % (floor, ceiling)
        )
    # Rejection sampling: redraw until the value is not the excluded one.
    while True:
        number = random.randint(floor, ceiling)
        if number != excluded_number:
            return number
| 214 | 
            +
                "\n",
         | 
def _format_query_result(rows):
    """Flatten the rows fetched for one query into the string kept in ``result``."""
    if len(rows) == 1:
        only_row = rows[0]
        # A two-column row whose second value is NULL keeps just the first value.
        # NOTE(review): presumably this matches how the existing results in the
        # TSV were generated - confirm against the script that produced them.
        if len(only_row) == 2 and only_row[1] is None:
            return str(only_row[0])
        return " | ".join(str(x) for x in only_row)
    # Zero or several rows: join the string form of each row tuple.
    return " | ".join(str(x) for x in rows)


def augment_dataframe(df, team_list, abbreviation_list, database):
    """Return ``df`` plus one team-swapped variant for every eligible row.

    A row is eligible when its ``natural_query`` mentions exactly one team from
    ``team_list`` and its ``sql_query`` contains either that team's name or its
    abbreviation. For such a row a different team is picked at random, the team
    is substituted in both queries, the new SQL is executed, and the formatted
    result is stored as a new row.

    Args:
        df: DataFrame with ``natural_query`` and ``sql_query`` columns; new rows
            carry ``natural_query``, ``sql_query`` and ``result``.
        team_list: Team names; index ``i`` pairs with ``abbreviation_list[i]``.
        abbreviation_list: Team abbreviations in the same order as ``team_list``.
        database: Object providing ``execute(query)`` and ``fetchall()`` (the
            notebook passes a ``sqlite3`` cursor).

    Returns:
        A new DataFrame. When at least one row was generated it is the original
        rows followed by the generated ones with a fresh 0..n-1 index;
        otherwise it is a plain copy of ``df``.
    """
    # A swap needs at least one other team to swap to.
    if len(team_list) < 2:
        return df.copy()
    last_index = len(team_list) - 1

    new_rows = []
    for _, row in df.iterrows():
        natural_query = row["natural_query"]
        sql_query = row["sql_query"]
        # Missing cells arrive as NaN/None and cannot be searched or rewritten.
        if not isinstance(natural_query, str) or not isinstance(sql_query, str):
            continue

        matches = find_teams_in_sample(natural_query, team_list)
        # Only do the simple update if exactly one team is detected.
        if len(matches) != 1:
            continue
        team_idx = matches[0]
        old_team_name = team_list[team_idx]

        # Decide which token the SQL uses for the team: full name first,
        # abbreviation as the fallback.
        if old_team_name in sql_query:
            new_index = get_random_excluding(0, last_index, team_idx)
            old_sql_token = old_team_name
            new_sql_token = team_list[new_index]
        elif abbreviation_list[team_idx] in sql_query:
            new_index = get_random_excluding(0, last_index, team_idx)
            old_sql_token = abbreviation_list[team_idx]
            new_sql_token = abbreviation_list[new_index]
        else:
            continue

        new_natural_query = natural_query.replace(old_team_name, team_list[new_index])
        # NOTE(review): str.replace rewrites every occurrence of the token, so an
        # abbreviation such as 'MIN' would also be replaced inside the SQL MIN()
        # function name. Such queries normally fail to execute and are skipped
        # below, but confirm this is acceptable for the dataset.
        new_sql_query = sql_query.replace(old_sql_token, new_sql_token)

        # Obtain the result of running the rewritten query on the database.
        try:
            database.execute(new_sql_query)
            result = _format_query_result(database.fetchall())
        except Exception:
            # Best effort: a rewritten query that cannot be executed is dropped.
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate so a long run can still be interrupted.
            continue

        new_rows.append(
            {'natural_query': new_natural_query, 'sql_query': new_sql_query, 'result': result}
        )

    if not new_rows:
        return df.copy()
    # Concatenate once instead of once per generated row.
    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
| 274 | 
            +
               ]
         | 
| 275 | 
            +
              },
         | 
| 276 | 
            +
              {
         | 
| 277 | 
            +
               "cell_type": "markdown",
         | 
| 278 | 
            +
               "metadata": {},
         | 
| 279 | 
            +
               "source": [
         | 
| 280 | 
            +
                "## Test functions on small dataframe sample"
         | 
| 281 | 
            +
               ]
         | 
| 282 | 
            +
              },
         | 
| 283 | 
            +
              {
         | 
| 284 | 
            +
               "cell_type": "code",
         | 
| 285 | 
            +
               "execution_count": 47,
         | 
| 286 | 
            +
               "metadata": {},
         | 
| 287 | 
            +
               "outputs": [
         | 
| 288 | 
            +
                {
         | 
| 289 | 
            +
                 "name": "stdout",
         | 
| 290 | 
            +
                 "output_type": "stream",
         | 
| 291 | 
            +
                 "text": [
         | 
| 292 | 
            +
                  "Total dataset examples: 1044\n",
         | 
| 293 | 
            +
                  "New Dataset Length:\n",
         | 
| 294 | 
            +
                  "6\n",
         | 
| 295 | 
            +
                  "\n",
         | 
| 296 | 
            +
                  "What is the total free throws made by the Indiana Pacers at home?\n",
         | 
| 297 | 
            +
                  "SELECT SUM(ftm_home) as total_ftm  FROM game  WHERE team_name_home = 'Indiana Pacers';\n",
         | 
| 298 | 
            +
                  "39545.0\n",
         | 
| 299 | 
            +
                  "\n",
         | 
| 300 | 
            +
                  "How many total rebounds did the Los Angeles Lakers grab in the 1985 season?\n",
         | 
| 301 | 
            +
                  "SELECT SUM(reb) AS total_rebounds FROM (     SELECT reb_home AS reb FROM game WHERE team_abbreviation_home = 'LAL' AND season_id = '21985'     UNION ALL     SELECT reb_away AS reb FROM game WHERE team_abbreviation_away = 'LAL' AND season_id = '21985' );\n",
         | 
| 302 | 
            +
                  "3655.0\n",
         | 
| 303 | 
            +
                  "\n",
         | 
| 304 | 
            +
                  "How many home games did the Orlando Magic play in the 2013 season?\n",
         | 
| 305 | 
            +
                  "SELECT COUNT(*) FROM game WHERE team_name_home = 'Orlando Magic' AND season_id = '22013';\n",
         | 
| 306 | 
            +
                  "41.0\n",
         | 
| 307 | 
            +
                  "\n",
         | 
| 308 | 
            +
                  "What is the total free throws made by the Denver Nuggets at home?\n",
         | 
| 309 | 
            +
                  "SELECT SUM(ftm_home) as total_ftm  FROM game  WHERE team_name_home = 'Denver Nuggets';\n",
         | 
| 310 | 
            +
                  "43821.0\n",
         | 
| 311 | 
            +
                  "\n",
         | 
| 312 | 
            +
                  "How many total rebounds did the Miami Heat grab in the 1985 season?\n",
         | 
| 313 | 
            +
                  "SELECT SUM(reb) AS total_rebounds FROM (     SELECT reb_home AS reb FROM game WHERE team_abbreviation_home = 'MIA' AND season_id = '21985'     UNION ALL     SELECT reb_away AS reb FROM game WHERE team_abbreviation_away = 'MIA' AND season_id = '21985' );\n",
         | 
| 314 | 
            +
                  "None\n",
         | 
| 315 | 
            +
                  "\n",
         | 
| 316 | 
            +
                  "How many home games did the Atlanta Hawks play in the 2013 season?\n",
         | 
| 317 | 
            +
                  "SELECT COUNT(*) FROM game WHERE team_name_home = 'Atlanta Hawks' AND season_id = '22013';\n",
         | 
| 318 | 
            +
                  "41\n",
         | 
| 319 | 
            +
                  "\n"
         | 
| 320 | 
            +
                 ]
         | 
| 321 | 
            +
                }
         | 
| 322 | 
            +
               ],
         | 
| 323 | 
            +
               "source": [
         | 
from contextlib import closing

# Load dataset
train_df = pd.read_csv("./train-data/sql_train.tsv", sep='\t')

# Display dataset info
print(f"Total dataset examples: {len(train_df)}")

# Test augmentation on a sample of (at most) 3 rows; the cap keeps the cell
# usable on a dataset with fewer than 3 rows.
test_df = train_df.sample(n=min(3, len(train_df)))

# Setup sqlite database connection. A sqlite3 connection used directly as a
# context manager only commits or rolls back, so closing() is what actually
# releases the database file, including when augmentation raises.
with closing(sql.connect('./nba-data/nba.sqlite')) as connection:
    cursor = connection.cursor()
    # Run augmentation function
    augmented_df = augment_dataframe(test_df, team_array, abbreviation_array, cursor)

# Print output: the sampled rows come first, followed by any generated rows.
print("New Dataset Length:")
print(len(augmented_df))
print()
for _, row in augmented_df.iterrows():
    print(row["natural_query"])
    print(row["sql_query"])
    print(row["result"])
    print()
| 352 | 
            +
               ]
         | 
| 353 | 
            +
              }
         | 
| 354 | 
            +
             ],
         | 
| 355 | 
            +
             "metadata": {
         | 
| 356 | 
            +
              "kernelspec": {
         | 
| 357 | 
            +
               "display_name": "Python 3",
         | 
| 358 | 
            +
               "language": "python",
         | 
| 359 | 
            +
               "name": "python3"
         | 
| 360 | 
            +
              },
         | 
| 361 | 
            +
              "language_info": {
         | 
| 362 | 
            +
               "codemirror_mode": {
         | 
| 363 | 
            +
                "name": "ipython",
         | 
| 364 | 
            +
                "version": 3
         | 
| 365 | 
            +
               },
         | 
| 366 | 
            +
               "file_extension": ".py",
         | 
| 367 | 
            +
               "mimetype": "text/x-python",
         | 
| 368 | 
            +
               "name": "python",
         | 
| 369 | 
            +
               "nbconvert_exporter": "python",
         | 
| 370 | 
            +
               "pygments_lexer": "ipython3",
         | 
| 371 | 
            +
               "version": "3.10.9"
         | 
| 372 | 
            +
              }
         | 
| 373 | 
            +
             },
         | 
| 374 | 
            +
             "nbformat": 4,
         | 
| 375 | 
            +
             "nbformat_minor": 2
         | 
| 376 | 
            +
            }
         | 
    	
        train-data/sql_train.tsv
    CHANGED
    
    | @@ -561,14 +561,14 @@ What is the highest combined pts in any game involving the Milwaukee Bucks?	SELE | |
| 561 | 
             
            How many home games did the Los Angeles Lakers play in the 2022 season?	SELECT COUNT(*) FROM game WHERE team_name_home = 'Los Angeles Lakers' AND season_id = '22022';	41.0
         | 
| 562 | 
             
            What is the highest combined pts in any game involving the Milwaukee Bucks?	SELECT MAX(pts_home + pts_away) FROM game WHERE team_name_home = 'Milwaukee Bucks' OR team_name_away = 'Milwaukee Bucks';	337.0
         | 
| 563 | 
             
            How many away games did the Chicago Bulls play in the 2020 season?	SELECT COUNT(*) FROM game WHERE team_name_away = 'Chicago Bulls' AND season_id = '22020';	36.0
         | 
| 564 | 
            -
            In which season did the Golden State Warriors have the highest average fg_pct at home?	SELECT season_id, AVG(fg_pct_home) as avg_stat FROM game WHERE team_name_home = 'Golden State Warriors' GROUP BY season_id ORDER BY avg_stat DESC LIMIT 1; | 
| 565 | 
             
            How many away games did the Chicago Bulls play in the 2001 season?	SELECT COUNT(*) FROM game WHERE team_name_away = 'Chicago Bulls' AND season_id = '22001';	41.0
         | 
| 566 | 
             
            What is the average number of fg_pct in away games by the Miami Heat?	SELECT AVG(fg_pct_away) FROM game WHERE team_name_away = 'Miami Heat';	0.4499279161205765
         | 
| 567 | 
             
            How many away games did the Chicago Bulls play in the 2002 season?	SELECT COUNT(*) FROM game WHERE team_name_away = 'Chicago Bulls' AND season_id = '22002';	41.0
         | 
| 568 | 
             
            What is the highest combined reb in any game involving the Los Angeles Clippers?	SELECT MAX(reb_home + reb_away) FROM game WHERE team_name_home = 'Los Angeles Clippers' OR team_name_away = 'Los Angeles Clippers';	134.0
         | 
| 569 | 
             
            How many home games did the Los Angeles Lakers play in the 2005 season?	SELECT COUNT(*) FROM game WHERE team_name_home = 'Los Angeles Lakers' AND season_id = '22005';	41.0
         | 
| 570 | 
             
            How many home games did the Los Angeles Lakers play in the 2003 season?	SELECT COUNT(*) FROM game WHERE team_name_home = 'Los Angeles Lakers' AND season_id = '22003';	41.0
         | 
| 571 | 
            -
            In which season did the Golden State Warriors have the highest average ast at home?	SELECT season_id, AVG(ast_home) as avg_stat FROM game WHERE team_name_home = 'Golden State Warriors' GROUP BY season_id ORDER BY avg_stat DESC LIMIT 1; | 
| 572 | 
             
            What is the average number of reb in away games by the Boston Celtics?	SELECT AVG(reb_away) FROM game WHERE team_name_away = 'Boston Celtics';	42.40882509303562
         | 
| 573 | 
             
            In which season did the Los Angeles Clippers have the highest average ast at home?	SELECT season_id, AVG(ast_home) as avg_stat FROM game WHERE team_name_home = 'Los Angeles Clippers' GROUP BY season_id ORDER BY avg_stat DESC LIMIT 1;	21988.0
         | 
| 574 | 
             
            What is the highest combined ast in any game involving the Houston Rockets?	SELECT MAX(ast_home + ast_away) FROM game WHERE team_name_home = 'Houston Rockets' OR team_name_away = 'Houston Rockets';	81.0
         | 
| @@ -716,7 +716,7 @@ What is the total points in the paint by the Minnesota Timberwolves away when th | |
| 716 | 
             
            What is the average points scored by the Toronto Raptors at home when they had more than 10 second chance points in 1996?	SELECT AVG(g.pts_home) as avg_points  FROM game g  JOIN other_stats os ON g.game_id = os.game_id  WHERE g.team_name_home = 'Toronto Raptors' AND os.pts_2nd_chance_home > 10 AND g.season_id = '21996';	96.458
         | 
| 717 | 
             
            What is the total number of points scored by the Atlanta Hawks at home?	SELECT SUM(pts_home) as total_points  FROM game  WHERE team_name_home = 'Atlanta Hawks';	233546.0
         | 
| 718 | 
             
            How many games did the Boston Celtics lose at home in the 1996 season?	SELECT COUNT(*) as losses  FROM game  WHERE team_name_home = 'Boston Celtics' AND wl_home = 'L' AND season_id = '21996';	30.0
         | 
| 719 | 
            -
            What is the highest field goals made by the Chicago Bulls at home?	SELECT MAX(fgm_home) as max_fgm  FROM game  WHERE team_name_home = 'Chicago Bulls';	0 | 
| 720 | 
             
            How many games did the Cleveland Cavaliers lose away in 1996?	SELECT COUNT(*) as away_losses  FROM game  WHERE team_name_away = 'Cleveland Cavaliers' AND wl_away = 'L' AND season_id = '21996';	24.0
         | 
| 721 | 
             
            What is the total points scored by the Dallas Mavericks away?	SELECT SUM(pts_away) as total_points  FROM game  WHERE team_name_away = 'Dallas Mavericks';	187891.0
         | 
| 722 | 
             
            How many games did the Denver Nuggets win at home in 1996?	SELECT COUNT(*) as home_wins  FROM game  WHERE team_name_home = 'Denver Nuggets' AND wl_home = 'W' AND season_id = '21996';	12.0
         | 
|  | |
| 561 | 
             
            How many home games did the Los Angeles Lakers play in the 2022 season?	SELECT COUNT(*) FROM game WHERE team_name_home = 'Los Angeles Lakers' AND season_id = '22022';	41.0
         | 
| 562 | 
             
            What is the highest combined pts in any game involving the Milwaukee Bucks?	SELECT MAX(pts_home + pts_away) FROM game WHERE team_name_home = 'Milwaukee Bucks' OR team_name_away = 'Milwaukee Bucks';	337.0
         | 
| 563 | 
             
            How many away games did the Chicago Bulls play in the 2020 season?	SELECT COUNT(*) FROM game WHERE team_name_away = 'Chicago Bulls' AND season_id = '22020';	36.0
         | 
| 564 | 
            +
            In which season did the Golden State Warriors have the highest average fg_pct at home?	SELECT season_id, AVG(fg_pct_home) as avg_stat FROM game WHERE team_name_home = 'Golden State Warriors' GROUP BY season_id ORDER BY avg_stat DESC LIMIT 1;	21981|0.5885
         | 
| 565 | 
             
            How many away games did the Chicago Bulls play in the 2001 season?	SELECT COUNT(*) FROM game WHERE team_name_away = 'Chicago Bulls' AND season_id = '22001';	41.0
         | 
| 566 | 
             
            What is the average number of fg_pct in away games by the Miami Heat?	SELECT AVG(fg_pct_away) FROM game WHERE team_name_away = 'Miami Heat';	0.4499279161205765
         | 
| 567 | 
             
            How many away games did the Chicago Bulls play in the 2002 season?	SELECT COUNT(*) FROM game WHERE team_name_away = 'Chicago Bulls' AND season_id = '22002';	41.0
         | 
| 568 | 
             
            What is the highest combined reb in any game involving the Los Angeles Clippers?	SELECT MAX(reb_home + reb_away) FROM game WHERE team_name_home = 'Los Angeles Clippers' OR team_name_away = 'Los Angeles Clippers';	134.0
         | 
| 569 | 
             
            How many home games did the Los Angeles Lakers play in the 2005 season?	SELECT COUNT(*) FROM game WHERE team_name_home = 'Los Angeles Lakers' AND season_id = '22005';	41.0
         | 
| 570 | 
             
            How many home games did the Los Angeles Lakers play in the 2003 season?	SELECT COUNT(*) FROM game WHERE team_name_home = 'Los Angeles Lakers' AND season_id = '22003';	41.0
         | 
| 571 | 
            +
            In which season did the Golden State Warriors have the highest average ast at home?	SELECT season_id, AVG(ast_home) as avg_stat FROM game WHERE team_name_home = 'Golden State Warriors' GROUP BY season_id ORDER BY avg_stat DESC LIMIT 1;	2016.0
         | 
| 572 | 
             
            What is the average number of reb in away games by the Boston Celtics?	SELECT AVG(reb_away) FROM game WHERE team_name_away = 'Boston Celtics';	42.40882509303562
         | 
| 573 | 
             
            In which season did the Los Angeles Clippers have the highest average ast at home?	SELECT season_id, AVG(ast_home) as avg_stat FROM game WHERE team_name_home = 'Los Angeles Clippers' GROUP BY season_id ORDER BY avg_stat DESC LIMIT 1;	21988.0
         | 
| 574 | 
             
            What is the highest combined ast in any game involving the Houston Rockets?	SELECT MAX(ast_home + ast_away) FROM game WHERE team_name_home = 'Houston Rockets' OR team_name_away = 'Houston Rockets';	81.0
         | 
|  | |
| 716 | 
             
            What is the average points scored by the Toronto Raptors at home when they had more than 10 second chance points in 1996?	SELECT AVG(g.pts_home) as avg_points  FROM game g  JOIN other_stats os ON g.game_id = os.game_id  WHERE g.team_name_home = 'Toronto Raptors' AND os.pts_2nd_chance_home > 10 AND g.season_id = '21996';	96.458
         | 
| 717 | 
             
            What is the total number of points scored by the Atlanta Hawks at home?	SELECT SUM(pts_home) as total_points  FROM game  WHERE team_name_home = 'Atlanta Hawks';	233546.0
         | 
| 718 | 
             
            How many games did the Boston Celtics lose at home in the 1996 season?	SELECT COUNT(*) as losses  FROM game  WHERE team_name_home = 'Boston Celtics' AND wl_home = 'L' AND season_id = '21996';	30.0
         | 
| 719 | 
            +
            What is the highest field goals made by the Chicago Bulls at home?	SELECT MAX(fgm_home) as max_fgm  FROM game  WHERE team_name_home = 'Chicago Bulls';	67.0
         | 
| 720 | 
             
            How many games did the Cleveland Cavaliers lose away in 1996?	SELECT COUNT(*) as away_losses  FROM game  WHERE team_name_away = 'Cleveland Cavaliers' AND wl_away = 'L' AND season_id = '21996';	24.0
         | 
| 721 | 
             
            What is the total points scored by the Dallas Mavericks away?	SELECT SUM(pts_away) as total_points  FROM game  WHERE team_name_away = 'Dallas Mavericks';	187891.0
         | 
| 722 | 
             
            How many games did the Denver Nuggets win at home in 1996?	SELECT COUNT(*) as home_wins  FROM game  WHERE team_name_home = 'Denver Nuggets' AND wl_home = 'W' AND season_id = '21996';	12.0
         | 

