This tutorial is based on the book "SQL Practice Problems"
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import functools # reduce
import sqlite3
import findspark, pyspark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Author: Bhishan Poudel

Python implementation: CPython
Python version       : 3.7.7
IPython version      : 7.16.1

Compiler    : Clang 4.0.1 (tags/RELEASE_401/final)
OS          : Darwin
Release     : 19.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit

pyspark  : 3.1.2
sqlite3  : 2.6.0
findspark: 1.4.2
numpy    : 1.17.5
json     : 2.0.9
autopep8 : 1.5.4
pandas   : 1.0.5
# pyspark settings
import os
import findspark
HOME = os.path.expanduser('~')
findspark.init(HOME + "/Softwares/Spark/spark-3.1.2-bin-hadoop3.2")
# Set spark environments
os.environ['PYSPARK_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col as _col
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.window import Window
from pyspark.sql.types import (StructField,
                               StringType, IntegerType, BooleanType,
                               FloatType, DoubleType, DateType,
                               StructType)
from pyspark.sql.functions import (mean as _mean, min as _min,
max as _max, avg as _avg,
when as _when
)
spark = SparkSession\
.builder\
.appName("bhishan")\
.getOrCreate()
sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")
print(pyspark.__version__)
3.1.2
!pwd
/Volumes/Media/github/Tutorials_and_Lessons/SQL/c02_Book_sql_practice_problems_57
!ls data
Northwind.png Northwind_small.sqlite orders_data.sql Northwind.sqlite northwind.sql suppliers_products.png
# load the jupyter magic from ipython-sql
%load_ext sql
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False # no Done, rows affected
%config SqlMagic.autopandas=True
%sql sqlite:////Volumes/Media/github/Tutorials_and_Lessons/SQL/c02_Book_sql_practice_problems_57/data/Northwind.sqlite
# list the active connections (returns a dict)
con_dict = %sql --connections
print(con_dict)
{'sqlite:////Volumes/Media/github/Tutorials_and_Lessons/SQL/c02_Book_sql_practice_problems_57/data/Northwind.sqlite': <sql.connection.Connection object at 0x7fc52b48ad50>}
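As an aside, the same database can also be queried with the plain sqlite3 module imported above plus pandas.read_sql, without the ipython-sql magic. A minimal sketch, assuming the relative path data/Northwind.sqlite seen in the directory listing above:
# a minimal sketch: pull a table with sqlite3 + pandas instead of the %sql magic
con = sqlite3.connect('data/Northwind.sqlite')
customer_df = pd.read_sql('SELECT * FROM Customer LIMIT 2;', con)
con.close()
customer_df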
%%sql
SELECT *
FROM sqlite_master
WHERE TYPE='table';
type | name | tbl_name | rootpage | sql | |
---|---|---|---|---|---|
0 | table | Employee | Employee | 2 | CREATE TABLE "Employee" \n(\n "Id" INTEGER PR... |
1 | table | Category | Category | 3 | CREATE TABLE "Category" \n(\n "Id" INTEGER PR... |
2 | table | Customer | Customer | 4 | CREATE TABLE "Customer" \n(\n "Id" VARCHAR(80... |
3 | table | Shipper | Shipper | 8 | CREATE TABLE "Shipper" \n(\n "Id" INTEGER PRI... |
4 | table | Supplier | Supplier | 9 | CREATE TABLE "Supplier" \n(\n "Id" INTEGER PR... |
5 | table | Product | Product | 12 | CREATE TABLE "Product" \n(\n "Id" INTEGER PRI... |
6 | table | OrderDetail | OrderDetail | 14 | CREATE TABLE "OrderDetail" \n(\n "Id" VARCHAR... |
7 | table | CustomerCustomerDemo | CustomerCustomerDemo | 16 | CREATE TABLE "CustomerCustomerDemo" \n(\n "Id... |
8 | table | CustomerDemographic | CustomerDemographic | 18 | CREATE TABLE "CustomerDemographic" \n(\n "Id"... |
9 | table | Region | Region | 21 | CREATE TABLE "Region" \n(\n "Id" INTEGER PRIM... |
10 | table | Territory | Territory | 22 | CREATE TABLE "Territory" \n(\n "Id" VARCHAR(8... |
11 | table | EmployeeTerritory | EmployeeTerritory | 24 | CREATE TABLE "EmployeeTerritory" \n(\n "Id" V... |
12 | table | Order | Order | 11 | CREATE TABLE "Order" (\n"index" INTEGER,\n "O... |
13 | table | orders | orders | 155 | CREATE TABLE orders (\n orderid smallint NO... |
# %%sql
# SELECT *
# FROM Order
# LIMIT 3;
# the Order table is problematic: 'Order' is a reserved SQL keyword (it would need quoting),
# so we use the orders table instead.
c = 'VINET'
%%sql
SELECT *
FROM Orders
WHERE customerid = '{c}'
LIMIT 3;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10274 | VINET | 6 | 1996-08-06 | 1996-09-03 | 1996-08-16 | 1 | 6.010000 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
2 | 10295 | VINET | 2 | 1996-09-02 | 1996-09-30 | 1996-09-10 | 2 | 1.150000 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
Employee = %sql SELECT * from Employee;
Category = %sql SELECT * from Category;
Customer = %sql SELECT * from Customer;
Shipper = %sql SELECT * from Shipper;
Supplier = %sql SELECT * from Supplier;
# Order = %sql SELECT * from Order;
Orders = %sql SELECT * from Orders;
Product = %sql SELECT * from Product;
OrderDetail = %sql SELECT * from OrderDetail;
CustomerCustomerDemo = %sql SELECT * from CustomerCustomerDemo;
CustomerDemographic = %sql SELECT * from CustomerDemographic;
Region = %sql SELECT * from Region;
Territory = %sql SELECT * from Territory;
EmployeeTerritory = %sql SELECT * from EmployeeTerritory;
dfs_names = [ 'Employee', 'Category','Customer',
'Shipper','Supplier','Orders',
'Product','OrderDetail',
'CustomerCustomerDemo','CustomerDemographic',
'Region', 'Territory', 'EmployeeTerritory']
dfs = [ Employee, Category,Customer,Shipper,Supplier,
Orders, Product,OrderDetail,
CustomerCustomerDemo,CustomerDemographic,
Region, Territory, EmployeeTerritory]
for df_name, df in zip(dfs_names,dfs):
print(df_name, df.shape)
display(df.head(2))
Employee (9, 18)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
1 | 2 | Fuller | Andrew | Vice President, Sales | Dr. | 1952-02-19 | 1992-08-14 | 908 W. Capital Way | Tacoma | WA | 98401 | USA | (206) 555-9482 | 3457 | None | Andrew received his BTS commercial in 1974 and... | NaN | http://accweb/emmployees/fuller.bmp |
Category (8, 3)
Id | CategoryName | Description | |
---|---|---|---|
0 | 1 | Beverages | Soft drinks, coffees, teas, beers, and ales |
1 | 2 | Condiments | Sweet and savory sauces, relishes, spreads, an... |
Customer (91, 11)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Shipper (3, 3)
Id | CompanyName | Phone | |
---|---|---|---|
0 | 1 | Speedy Express | (503) 555-9831 |
1 | 2 | United Package | (503) 555-3199 |
Supplier (29, 12)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | HomePage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Exotic Liquids | Charlotte Cooper | Purchasing Manager | 49 Gilbert St. | London | None | EC1 4SD | UK | (171) 555-2222 | None | None |
1 | 2 | New Orleans Cajun Delights | Shelley Burke | Order Administrator | P.O. Box 78934 | New Orleans | LA | 70117 | USA | (100) 555-4822 | None | #CAJUN.HTM# |
Orders (830, 14)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Product (77, 10)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
OrderDetail (2155, 6)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
CustomerCustomerDemo (0, 0)
CustomerDemographic (0, 0)
Region (4, 2)
Id | RegionDescription | |
---|---|---|
0 | 1 | Eastern |
1 | 2 | Western |
Territory (53, 3)
Id | TerritoryDescription | RegionId | |
---|---|---|---|
0 | 01581 | Westboro | 1 |
1 | 01730 | Bedford | 1 |
EmployeeTerritory (49, 3)
Id | EmployeeId | TerritoryId | |
---|---|---|---|
0 | 1/06897 | 1 | 06897 |
1 | 1/19713 | 1 | 19713 |
def spark_df_from_pandas(pandas_df):
    """Create a Spark DataFrame from a pandas DataFrame with an explicit schema."""
    # map each pandas dtype to the name of the corresponding Spark type
    df_dtype = pandas_df.dtypes.astype(str).reset_index()
    df_dtype.columns = ['column', 'dtype']
    mapping = {'int64'         : 'IntegerType()',
               'float64'       : 'DoubleType()',
               'bool'          : 'BooleanType()',
               'object'        : 'StringType()',
               'datetime64[ns]': 'DateType()',
               }
    df_dtype['dtype'] = df_dtype['dtype'].map(mapping)

    # build the schema as a string, e.g. "StructType([StructField('Id',IntegerType(),True), ...])"
    df_dtype['schema'] = "    StructField('" +\
                         df_dtype['column'] + "'," +\
                         df_dtype['dtype'] + ",True),"
    head = 'StructType([\n'
    body = '\n'.join(df_dtype['schema'])
    tail = '\n    ])'
    schema = head + body + tail

    # evaluate the schema string and create the Spark DataFrame
    spark_df = sqlContext.createDataFrame(pandas_df, eval(schema))
    return spark_df
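The string-building plus eval approach above works, but the schema can also be constructed directly. A minimal sketch that mirrors the same dtype mapping (spark_df_from_pandas_v2 is just an illustrative name):
# a sketch of the same idea without the string + eval() step:
# build the StructType directly from the pandas dtypes.
from pyspark.sql.types import (StructType, StructField, StringType,
                               IntegerType, DoubleType, BooleanType, DateType)

def spark_df_from_pandas_v2(pandas_df):
    type_map = {'int64': IntegerType(), 'float64': DoubleType(),
                'bool': BooleanType(), 'object': StringType(),
                'datetime64[ns]': DateType()}
    fields = [StructField(col, type_map.get(str(dtype), StringType()), True)
              for col, dtype in pandas_df.dtypes.items()]
    return spark.createDataFrame(pandas_df, StructType(fields))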
sEmployee = spark_df_from_pandas(Employee)
sEmployee.createOrReplaceTempView("Employee")
sCategory = spark_df_from_pandas(Category)
sCategory.createOrReplaceTempView("Category")
sCustomer = spark_df_from_pandas(Customer)
sCustomer.createOrReplaceTempView("Customer")
sShipper = spark_df_from_pandas(Shipper)
sShipper.createOrReplaceTempView("Shipper")
sSupplier = spark_df_from_pandas(Supplier)
sSupplier.createOrReplaceTempView("Supplier")
sOrders = spark_df_from_pandas(Orders)
sOrders.createOrReplaceTempView("Orders")
sProduct = spark_df_from_pandas(Product)
sProduct.createOrReplaceTempView("Product")
sOrderDetail = spark_df_from_pandas(OrderDetail)
sOrderDetail.createOrReplaceTempView("OrderDetail")
try:
sCustomerCustomerDemo = spark_df_from_pandas(CustomerCustomerDemo)
except:
pass
# IndexError: list index out of range
try:
sCustomerCustomerDemo.createOrReplaceTempView("CustomerCustomerDemo")
except:
pass
try:
sCustomerDemographic = spark_df_from_pandas(CustomerDemographic)
sCustomerDemographic.createOrReplaceTempView("CustomerDemographic")
except:
pass
sRegion = spark_df_from_pandas(Region)
sRegion.createOrReplaceTempView("Region")
sTerritory = spark_df_from_pandas(Territory)
sTerritory.createOrReplaceTempView("Territory")
sEmployeeTerritory = spark_df_from_pandas(EmployeeTerritory)
sEmployeeTerritory.createOrReplaceTempView("EmployeeTerritory")
sOrders.createOrReplaceTempView("Orders") # create Orders table.
sOrders.printSchema()
root
 |-- orderid: integer (nullable = true)
 |-- customerid: string (nullable = true)
 |-- employeeid: integer (nullable = true)
 |-- orderdate: string (nullable = true)
 |-- requireddate: string (nullable = true)
 |-- shippeddate: string (nullable = true)
 |-- shipvia: integer (nullable = true)
 |-- freight: double (nullable = true)
 |-- shipname: string (nullable = true)
 |-- shipaddress: string (nullable = true)
 |-- shipcity: string (nullable = true)
 |-- shipregion: string (nullable = true)
 |-- shippostalcode: string (nullable = true)
 |-- shipcountry: string (nullable = true)
# sEmployee.show(2)
# spark.sql('select * from Employee limit 2').show()
For this problem, we’d like to see the total number of products in each category. Sort the results by the total number of products, in descending order.
Category.head(2)
Id | CategoryName | Description | |
---|---|---|---|
0 | 1 | Beverages | Soft drinks, coffees, teas, beers, and ales |
1 | 2 | Condiments | Sweet and savory sauces, relishes, spreads, an... |
Product.head(2)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
Product[['CategoryId']]\
.merge(Category[['Id','CategoryName']],
left_on='CategoryId',
right_on='Id')\
['CategoryName'].value_counts(ascending=False)
Confections       13
Seafood           12
Condiments        12
Beverages         12
Dairy Products    10
Grains/Cereals     7
Meat/Poultry       6
Produce            5
Name: CategoryName, dtype: int64
%%sql
SELECT CategoryName,
count(CategoryName)
FROM Product P
INNER JOIN Category C ON P.CategoryId = C.Id
GROUP BY CategoryName
ORDER BY count(CategoryName) DESC;
CategoryName | count(CategoryName) | |
---|---|---|
0 | Confections | 13 |
1 | Seafood | 12 |
2 | Condiments | 12 |
3 | Beverages | 12 |
4 | Dairy Products | 10 |
5 | Grains/Cereals | 7 |
6 | Meat/Poultry | 6 |
7 | Produce | 5 |
q = """
select CategoryName, count(CategoryName)
from Product P inner join Category C
on P.CategoryId = C.Id
group by CategoryName
order by count(CategoryName) desc
"""
spark.sql(q).show()
+--------------+-------------------+
|  CategoryName|count(CategoryName)|
+--------------+-------------------+
|   Confections|                 13|
|     Beverages|                 12|
|       Seafood|                 12|
|    Condiments|                 12|
|Dairy Products|                 10|
|Grains/Cereals|                  7|
|  Meat/Poultry|                  6|
|       Produce|                  5|
+--------------+-------------------+
sProduct.select(['CategoryId'])\
.join(sCategory.withColumnRenamed('Id','CategoryId')\
.select('CategoryId','CategoryName'),
on='CategoryId')\
.groupby('CategoryName')\
.count()\
.orderBy('count',ascending=False)\
.show()
+--------------+-----+
|  CategoryName|count|
+--------------+-----+
|   Confections|   13|
|     Beverages|   12|
|       Seafood|   12|
|    Condiments|   12|
|Dairy Products|   10|
|Grains/Cereals|    7|
|  Meat/Poultry|    6|
|       Produce|    5|
+--------------+-----+
In the Customers table, show the total number of customers per Country and City.
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Customer.groupby(['Country', 'City'])['Id']\
.count().rename('count').reset_index().head(2)
Country | City | count | |
---|---|---|---|
0 | Argentina | Buenos Aires | 3 |
1 | Austria | Graz | 1 |
%%sql
SELECT Country,
City,
count(*) AS COUNT
FROM Customer
GROUP BY Country,
City
LIMIT 2;
Country | City | COUNT | |
---|---|---|---|
0 | Argentina | Buenos Aires | 3 |
1 | Austria | Graz | 1 |
sCustomer.groupby(['Country', 'City']).count().show(2)
+-------+---------+-----+
|Country|     City|count|
+-------+---------+-----+
|Belgium|Charleroi|    1|
|Germany|  Münster|    1|
+-------+---------+-----+
only showing top 2 rows
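The Spark groupby result comes back unordered; to line it up with the pandas and SQL answers, sort explicitly. A small sketch:
# sort the grouped counts so the output order matches the pandas/SQL answers
sCustomer.groupby(['Country', 'City']).count()\
         .orderBy(['Country', 'City']).show(2)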
What products do we have in our inventory that should be reordered? For now, just use the fields UnitsInStock and ReorderLevel, where UnitsInStock is less than the ReorderLevel, ignoring the fields UnitsOnOrder and Discontinued. Order the results by ProductID.
Product.head(2)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
Product[Product.UnitsInStock < Product.ReorderLevel].sort_values('Id')\
[['Id','ProductName','UnitsInStock','ReorderLevel']]
Id | ProductName | UnitsInStock | ReorderLevel | |
---|---|---|---|---|
1 | 2 | Chang | 17 | 25 |
2 | 3 | Aniseed Syrup | 13 | 25 |
10 | 11 | Queso Cabrales | 22 | 30 |
20 | 21 | Sir Rodney's Scones | 3 | 5 |
29 | 30 | Nord-Ost Matjeshering | 10 | 15 |
30 | 31 | Gorgonzola Telino | 0 | 20 |
31 | 32 | Mascarpone Fabioli | 9 | 25 |
36 | 37 | Gravad lax | 11 | 25 |
42 | 43 | Ipoh Coffee | 17 | 25 |
44 | 45 | Rogede sild | 5 | 15 |
47 | 48 | Chocolade | 15 | 25 |
48 | 49 | Maxilaku | 10 | 15 |
55 | 56 | Gnocchi di nonna Alice | 21 | 30 |
63 | 64 | Wimmers gute Semmelknödel | 22 | 30 |
65 | 66 | Louisiana Hot Spiced Okra | 4 | 20 |
67 | 68 | Scottish Longbreads | 6 | 15 |
69 | 70 | Outback Lager | 15 | 30 |
73 | 74 | Longlife Tofu | 4 | 5 |
%%sql
SELECT Id,
ProductName,
UnitsInStock,
ReorderLevel
FROM Product
WHERE UnitsInStock < ReorderLevel
ORDER BY Id;
Id | ProductName | UnitsInStock | ReorderLevel | |
---|---|---|---|---|
0 | 2 | Chang | 17 | 25 |
1 | 3 | Aniseed Syrup | 13 | 25 |
2 | 11 | Queso Cabrales | 22 | 30 |
3 | 21 | Sir Rodney's Scones | 3 | 5 |
4 | 30 | Nord-Ost Matjeshering | 10 | 15 |
5 | 31 | Gorgonzola Telino | 0 | 20 |
6 | 32 | Mascarpone Fabioli | 9 | 25 |
7 | 37 | Gravad lax | 11 | 25 |
8 | 43 | Ipoh Coffee | 17 | 25 |
9 | 45 | Rogede sild | 5 | 15 |
10 | 48 | Chocolade | 15 | 25 |
11 | 49 | Maxilaku | 10 | 15 |
12 | 56 | Gnocchi di nonna Alice | 21 | 30 |
13 | 64 | Wimmers gute Semmelknödel | 22 | 30 |
14 | 66 | Louisiana Hot Spiced Okra | 4 | 20 |
15 | 68 | Scottish Longbreads | 6 | 15 |
16 | 70 | Outback Lager | 15 | 30 |
17 | 74 | Longlife Tofu | 4 | 5 |
# spark.sql(q).show()
sProduct[sProduct.UnitsInStock < sProduct.ReorderLevel].orderBy('Id')\
[['Id','ProductName','UnitsInStock','ReorderLevel']]
DataFrame[Id: int, ProductName: string, UnitsInStock: int, ReorderLevel: int]
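The cell above displays only the DataFrame schema because .show() was never called; a small sketch that prints the actual rows:
# select the columns explicitly and call .show() to print the rows
sProduct[sProduct.UnitsInStock < sProduct.ReorderLevel]\
    .orderBy('Id')\
    .select('Id', 'ProductName', 'UnitsInStock', 'ReorderLevel')\
    .show(3)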
Now we need to incorporate the fields UnitsInStock, UnitsOnOrder, ReorderLevel, and Discontinued into our calculation. We'll define "products that need reordering" with the following criteria: UnitsInStock plus UnitsOnOrder is less than or equal to ReorderLevel, and Discontinued is 0 (false).
Product.head(2)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
cond1 = (Product['UnitsInStock'] + Product['UnitsOnOrder']) <= Product['ReorderLevel']
cond2 = Product['Discontinued'] == 0
cond = cond1 & cond2
Product[cond][['UnitsInStock', 'UnitsOnOrder', 'ReorderLevel', 'Discontinued']]
UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|
29 | 10 | 0 | 15 | 0 |
69 | 15 | 10 | 30 | 0 |
%%sql
SELECT UnitsInStock,
UnitsOnOrder,
ReorderLevel,
Discontinued
FROM Product
WHERE (UnitsInStock + UnitsOnOrder) <= ReorderLevel
AND Discontinued = 0;
UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|
0 | 10 | 0 | 15 | 0 |
1 | 15 | 10 | 30 | 0 |
cond1 = (sProduct['UnitsInStock'] +
sProduct['UnitsOnOrder']
) <= sProduct['ReorderLevel']
cond2 = sProduct['Discontinued'] == 0
cond = cond1 & cond2
sProduct[cond][['UnitsInStock', 'UnitsOnOrder',
'ReorderLevel', 'Discontinued']].show()
+------------+------------+------------+------------+
|UnitsInStock|UnitsOnOrder|ReorderLevel|Discontinued|
+------------+------------+------------+------------+
|          10|           0|          15|           0|
|          15|          10|          30|           0|
+------------+------------+------------+------------+
A salesperson for Northwind is going on a business trip to visit customers, and would like to see a list of all customers, sorted by region, alphabetically. However, he wants the customers with no region (null in the Region field) to be at the end, instead of at the top, where you’d normally find the null values. Within the same region, companies should be sorted by CustomerID.
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Customer[['Id','CompanyName','Region']].sort_values(['Region','Id']).head(4)
Id | CompanyName | Region | |
---|---|---|---|
54 | OLDWO | Old World Delicatessen | AK |
9 | BOTTM | Bottom-Dollar Markets | BC |
41 | LAUGB | Laughing Bacchus Wine Cellars | BC |
44 | LETSS | Let's Stop N Shop | CA |
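pandas already places missing values last when sorting, which is why no NaN regions appear in the head above; na_position makes that behaviour explicit. A small sketch:
# sort_values puts NaN/None last by default; na_position spells it out
Customer[['Id', 'CompanyName', 'Region']]\
    .sort_values(['Region', 'Id'], na_position='last')\
    .head(4)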
%%sql
-- Note: region is used twice in ORDER BY: once to push NULLs to the end, once to sort.
SELECT Id,
CompanyName,
Region
FROM Customer
ORDER BY region IS NULL,
region,
id;
-- alternative: ORDER BY case when Region is null then 1 else 0 end asc, region asc, Id asc;
sCustomer.select('Id', 'CompanyName', 'Region').orderBy(
sCustomer.Region.asc_nulls_last(), 'Id').show(3)
+-----+--------------------+------+
|   Id|         CompanyName|Region|
+-----+--------------------+------+
|OLDWO|Old World Delicat...|    AK|
|BOTTM|Bottom-Dollar Mar...|    BC|
|LAUGB|Laughing Bacchus ...|    BC|
+-----+--------------------+------+
only showing top 3 rows
Some of the countries we ship to have very high freight charges. We'd like to investigate some more shipping options for our customers, to be able to offer them lower freight charges. Return the three ship countries with the highest average freight overall, in descending order by average freight.
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
# good
Orders.groupby('shipcountry')['freight'].mean().rename('avg_freight').nlargest(3).reset_index()
shipcountry | avg_freight | |
---|---|---|
0 | Austria | 184.787501 |
1 | Ireland | 145.012629 |
2 | USA | 112.879427 |
# good (Named Aggregation pandas 1.0)
Orders.groupby('shipcountry').agg(
avg_freight = ('freight','mean')
).nlargest(3,'avg_freight')
avg_freight | |
---|---|
shipcountry | |
Austria | 184.787501 |
Ireland | 145.012629 |
USA | 112.879427 |
# bad
Orders.groupby('shipcountry')['freight'].mean().sort_values(ascending=False).head(3)
shipcountry
Austria    184.787501
Ireland    145.012629
USA        112.879427
Name: freight, dtype: float64
%%sql
select * from Orders limit 2;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
%%sql
SELECT shipcountry,
avg(freight)
FROM Orders
GROUP BY shipcountry;
shipcountry | avg(freight) | |
---|---|---|
0 | Argentina | 37.411250 |
1 | Austria | 184.787501 |
2 | Belgium | 67.375788 |
3 | Brazil | 58.797470 |
4 | Canada | 73.269666 |
5 | Denmark | 77.566111 |
6 | Finland | 41.404091 |
7 | France | 55.036883 |
8 | Germany | 92.485902 |
9 | Ireland | 145.012629 |
10 | Italy | 30.872857 |
11 | Mexico | 40.099285 |
12 | Norway | 45.916666 |
13 | Poland | 25.105715 |
14 | Portugal | 49.502308 |
15 | Spain | 37.473478 |
16 | Sweden | 87.502703 |
17 | Switzerland | 76.029444 |
18 | UK | 52.754821 |
19 | USA | 112.879427 |
20 | Venezuela | 59.460435 |
%%sql
SELECT shipcountry,
avg(freight)
FROM Orders
GROUP BY shipcountry
ORDER BY avg(freight) DESC
LIMIT 3;
shipcountry | avg(freight) | |
---|---|---|
0 | Austria | 184.787501 |
1 | Ireland | 145.012629 |
2 | USA | 112.879427 |
sOrders.groupby('shipcountry')\
.agg({'freight':'avg'})\
.orderBy('avg(freight)',ascending=False)\
.limit(3)\
.show()
+-----------+------------------+
|shipcountry|      avg(freight)|
+-----------+------------------+
|    Austria|     184.787500619|
|    Ireland|145.01262900000003|
|        USA|112.87942658734427|
+-----------+------------------+
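The same aggregation can be written with pyspark.sql.functions (imported above as F) to give the column a cleaner alias. A minimal sketch:
# explicit avg + alias avoids the auto-generated 'avg(freight)' column name
sOrders.groupby('shipcountry')\
       .agg(F.round(F.avg('freight'), 2).alias('avg_freight'))\
       .orderBy(F.col('avg_freight').desc())\
       .show(3)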
spark.sql(q.replace('Orders','Order')).show()  # note: q still holds the earlier category-count query, so this just repeats that result
+--------------+-------------------+
|  CategoryName|count(CategoryName)|
+--------------+-------------------+
|   Confections|                 13|
|       Seafood|                 12|
|     Beverages|                 12|
|    Condiments|                 12|
|Dairy Products|                 10|
|Grains/Cereals|                  7|
|  Meat/Poultry|                  6|
|       Produce|                  5|
+--------------+-------------------+
We're continuing with the question above on high freight charges. Now, instead of using all the orders we have, we only want to see orders from the year 1997.
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders['orderdate'] = pd.to_datetime(Orders['orderdate'])
Orders['requireddate'] = pd.to_datetime(Orders['requireddate'])
Orders['shippeddate'] = pd.to_datetime(Orders['shippeddate'])
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders.orderdate.apply([min,max])
min   1996-07-04
max   1998-05-06
Name: orderdate, dtype: datetime64[ns]
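Equivalently, and a little more idiomatic than apply([min, max]), the aggregation API can be used directly:
# same result with the standard aggregation API
Orders['orderdate'].agg(['min', 'max'])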
# I have altered the Orders data (it spans 1996-1998), so I use 1997 instead of the book's 2015.
Orders[Orders.orderdate >= '1997-01-01'].head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
152 | 10400 | EASTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-16 | 3 | 83.93 | Eastern Connection | 35 King George | London | None | WX3 6FW | UK |
153 | 10401 | RATTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-10 | 1 | 12.51 | Rattlesnake Canyon Grocery | 2817 Milton Dr. | Albuquerque | NM | 87110 | USA |
df1 = Orders[Orders.orderdate.between('1997-01-01','1998-01-01')]
df1.head(2).append(df1.tail(2))
# pandas .between() is inclusive on both ends
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
152 | 10400 | EASTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-16 | 3 | 83.93 | Eastern Connection | 35 King George | London | None | WX3 6FW | UK |
153 | 10401 | RATTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-10 | 1 | 12.51 | Rattlesnake Canyon Grocery | 2817 Milton Dr. | Albuquerque | NM | 87110 | USA |
561 | 10809 | WELLI | 7 | 1998-01-01 | 1998-01-29 | 1998-01-07 | 1 | 4.87 | Wellington Importadora | Rua do Mercado, 12 | Resende | SP | 08737-363 | Brazil |
562 | 10810 | LAUGB | 2 | 1998-01-01 | 1998-01-29 | 1998-01-07 | 3 | 4.33 | Laughing Bacchus Wine Cellars | 2319 Elm St. | Vancouver | BC | V3F 2K1 | Canada |
df1 = Orders[ (Orders.orderdate >= '1997-01-01') &
(Orders.orderdate < '1998-01-01')]
df1.head(2).append(df1.tail(2))
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
152 | 10400 | EASTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-16 | 3 | 83.930000 | Eastern Connection | 35 King George | London | None | WX3 6FW | UK |
153 | 10401 | RATTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-10 | 1 | 12.510000 | Rattlesnake Canyon Grocery | 2817 Milton Dr. | Albuquerque | NM | 87110 | USA |
558 | 10806 | VICTE | 3 | 1997-12-31 | 1998-01-28 | 1998-01-05 | 2 | 22.110001 | Victuailles en stock | 2, rue du Commerce | Lyon | None | 69004 | France |
559 | 10807 | FRANS | 4 | 1997-12-31 | 1998-01-28 | 1998-01-30 | 1 | 1.360000 | Franchi S.p.A. | Via Monte Bianco 34 | Torino | None | 10100 | Italy |
df1.groupby('shipcountry')[['freight']].mean().sort_values('freight',ascending=False)
freight | |
---|---|
shipcountry | |
Austria | 178.364286 |
Switzerland | 117.177500 |
Sweden | 105.159999 |
Canada | 99.279999 |
Ireland | 98.076999 |
Germany | 97.383594 |
USA | 96.989501 |
Denmark | 94.622727 |
Venezuela | 71.044500 |
Belgium | 65.782857 |
France | 63.275898 |
Brazil | 53.000238 |
Mexico | 52.959166 |
Poland | 52.220001 |
Finland | 48.931539 |
Spain | 42.419999 |
UK | 39.618333 |
Portugal | 39.212857 |
Italy | 29.378000 |
Norway | 26.005000 |
Argentina | 19.610000 |
Orders[ (Orders.orderdate >= '1997-01-01') &
(Orders.orderdate < '1998-01-01')]\
.groupby('shipcountry')\
.agg(avg_freight = ('freight','mean')
).nlargest(3,'avg_freight')
avg_freight | |
---|---|
shipcountry | |
Austria | 178.364286 |
Switzerland | 117.177500 |
Sweden | 105.159999 |
%%sql
SELECT *
FROM Orders
WHERE orderdate BETWEEN '1997-01-01' AND '1998-01-01'
LIMIT 3;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10400 | EASTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-16 | 3 | 83.930000 | Eastern Connection | 35 King George | London | None | WX3 6FW | UK |
1 | 10401 | RATTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-10 | 1 | 12.510000 | Rattlesnake Canyon Grocery | 2817 Milton Dr. | Albuquerque | NM | 87110 | USA |
2 | 10402 | ERNSH | 8 | 1997-01-02 | 1997-02-13 | 1997-01-10 | 2 | 67.879997 | Ernst Handel | Kirchgasse 6 | Graz | None | 8010 | Austria |
%%sql
-- without casting as date
-- BETWEEN is inclusive on both ends.
SELECT shipcountry,
avg(freight)
FROM Orders
WHERE orderdate BETWEEN '1997-01-01' AND '1997-12-31'
GROUP BY shipcountry
ORDER BY avg(freight) DESC
LIMIT 3;
shipcountry | avg(freight) | |
---|---|---|
0 | Austria | 178.364286 |
1 | Switzerland | 117.177500 |
2 | Sweden | 105.159999 |
%%sql
-- using date
SELECT shipcountry,
avg(freight)
FROM Orders
WHERE cast(orderdate AS date) >= cast('1997-01-01' AS date)
AND cast(orderdate AS date) < cast('1998-01-01' AS date)
GROUP BY shipcountry
ORDER BY avg(freight) DESC
LIMIT 3;
(sqlite3.OperationalError) near "avg": syntax error [SQL: -- using date SELECT shipcountry, avg(freight) FROM Orders WHERE cast(orderdate AS date) >= cast('1997-01-01' AS date) AND cast(orderdate AS date) < cast('1998-01-01' AS date) GROUP BY shipcountry ORDER BY avg(freight) DESC LIMIT 3;] (Background on this error at: http://sqlalche.me/e/13/e3q8)
sOrders\
[ (sOrders.orderdate >= '1997-01-01') &
(sOrders.orderdate < '1998-01-01')]\
.groupby('shipcountry')\
.agg({'freight':'avg'})\
.orderBy('avg(freight)',ascending=False)\
.limit(3)\
.show()
+-----------+------------------+
|shipcountry|      avg(freight)|
+-----------+------------------+
|    Austria|178.36428571714285|
|Switzerland|     117.177499675|
|     Sweden|105.15999935000002|
+-----------+------------------+
# Note: this question essentially asks us to avoid the BETWEEN operator,
# since BETWEEN is inclusive on both ends.
We're continuing to work on high freight charges. We now want to get the three ship countries with the highest average freight charges. But instead of filtering for a particular year, we want to use the last 12 months of order data, using as the end date the last OrderDate in Orders.
# we only care about orders from the last 12 months of the data.
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders.orderdate.max()
Timestamp('1998-05-06 00:00:00')
Orders.orderdate.max() - pd.Timedelta('12 M') # careful: 'M' means minutes in Timedelta, so this is 12 minutes before, not 12 months
Timestamp('1998-05-05 23:48:00')
Orders.orderdate.max() - np.timedelta64(12,'M')
Timestamp('1997-05-05 18:10:48')
Orders.orderdate.max() - pd.DateOffset(months=12)
Timestamp('1997-05-06 00:00:00')
pd.to_datetime('2020-12-31') - pd.DateOffset(months=12)
Timestamp('2019-12-31 00:00:00')
cond1 = Orders.orderdate <= Orders.orderdate.max()
cond2 = Orders.orderdate > (Orders.orderdate.max() -
pd.DateOffset(months=12))
cond = cond1 & cond2
df1 = Orders[cond]
df1.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
281 | 10529 | MAISD | 5 | 1997-05-07 | 1997-06-04 | 1997-05-09 | 2 | 66.690002 | Maison Dewey | Rue Joseph-Bens 532 | Bruxelles | None | B-1180 | Belgium |
282 | 10530 | PICCO | 3 | 1997-05-08 | 1997-06-05 | 1997-05-12 | 2 | 339.220001 | Piccolo und mehr | Geislweg 14 | Salzburg | None | 5020 | Austria |
# Orders[Orders.orderdate == Orders.orderdate.max()].head(2)
Orders[cond]\
.groupby('shipcountry')\
.agg(avg_freight = ('freight','mean')
).nlargest(3,'avg_freight')
avg_freight | |
---|---|
shipcountry | |
Ireland | 200.209995 |
Austria | 186.459601 |
USA | 119.303256 |
%%sql
SELECT date(orderdate)
FROM Orders
LIMIT 3;
date(orderdate) | |
---|---|
0 | 1996-07-04 |
1 | 1996-07-05 |
2 | 1996-07-08 |
%%sql
SELECT date(orderdate, '-12 month')
FROM Orders
LIMIT 3;
date(orderdate, '-12 month') | |
---|---|
0 | 1995-07-04 |
1 | 1995-07-05 |
2 | 1995-07-08 |
%%sql
SELECT max(orderdate)
FROM Orders
LIMIT 3;
max(orderdate) | |
---|---|
0 | 1998-05-06 |
Orders.orderdate.max()
Timestamp('1998-05-06 00:00:00')
%%sql
SELECT date(max(orderdate), '-12 month')
FROM Orders
LIMIT 3;
date(max(orderdate), '-12 month') | |
---|---|
0 | 1997-05-06 |
%%sql
SELECT shipcountry,
round(avg(freight), 2)
FROM Orders
WHERE date(orderdate) >=
(SELECT date(max(orderdate), '-12 month')
FROM Orders)
AND date(orderdate) <
(SELECT date(max(orderdate))
FROM Orders)
GROUP BY shipcountry
ORDER BY avg(freight) DESC
LIMIT 3;
shipcountry | round(avg(freight), 2) | |
---|---|---|
0 | Ireland | 200.21 |
1 | Austria | 186.46 |
2 | USA | 119.24 |
sdf = sOrders[['orderdate','shipcountry','freight']]
sdf.show(2)
+----------+-----------+----------+
| orderdate|shipcountry|   freight|
+----------+-----------+----------+
|1996-07-04|     France|32.3800011|
|1996-07-05|    Germany|11.6099997|
+----------+-----------+----------+
only showing top 2 rows
from datetime import datetime
from dateutil.relativedelta import relativedelta
orderdate_max_str = sOrders.agg({'orderdate':'max'}).collect()[0][0]
orderdate_max = datetime.strptime(orderdate_max_str,'%Y-%m-%d')
orderdate_max
datetime.datetime(1998, 5, 6, 0, 0)
from datetime import datetime
from dateutil.relativedelta import relativedelta
last_year = orderdate_max - relativedelta(years=1)
last_year
datetime.datetime(1997, 5, 6, 0, 0)
orderdate_max_str = sOrders.agg({'orderdate':'max'}).collect()[0][0]
orderdate_max = datetime.strptime(orderdate_max_str,'%Y-%m-%d')
last_year = orderdate_max - relativedelta(years=1)
cond1 = sOrders.orderdate <= orderdate_max
cond2 = sOrders.orderdate > last_year
cond = cond1 & cond2
sOrders[cond]\
.groupby('shipcountry')\
.agg({'freight':'avg'})\
.orderBy('avg(freight)',ascending=False)\
.withColumn('avg_freight', F.round(F.col('avg(freight)'),2))\
.drop('avg(freight)')\
.limit(3)\
.show()
+-----------+-----------+
|shipcountry|avg_freight|
+-----------+-----------+
|    Ireland|     200.21|
|    Austria|     186.46|
|        USA|      119.3|
+-----------+-----------+
# spark.sql(...) with the SQL above does not work here: the SQLite-style date(x, '-12 month') modifier is not available in Spark SQL.
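Another option computes the 12-month cutoff inside Spark itself rather than collecting the max order date into Python and using relativedelta. A hedged sketch using to_date and add_months from pyspark.sql.functions (imported above as F); it uses the same open lower bound as the pandas solution above:
# compute the 12-month cutoff in Spark (orderdate is stored as a yyyy-MM-dd
# string, as printSchema showed, so convert it with to_date first)
max_date = sOrders.select(F.max(F.to_date('orderdate')).alias('mx'))
cutoff = max_date.select(F.add_months('mx', -12).alias('cutoff')).collect()[0]['cutoff']

sOrders.filter(F.to_date('orderdate') > F.lit(cutoff))\
       .groupby('shipcountry')\
       .agg(F.round(F.avg('freight'), 2).alias('avg_freight'))\
       .orderBy(F.col('avg_freight').desc())\
       .show(3)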
We're doing inventory, and need to show information like the below, for all orders. Sort by OrderID and Product ID.
Expected Results:
EmployeeID LastName OrderID ProductName Quantity
Look at the entity-relationship diagram:
# from the ER diagram, we see we need to join 4 tables:
# Product, OrderDetail, Orders, and Employee.
Product.head(2)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
OrderDetail.head(2)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Employee.head(2)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
1 | 2 | Fuller | Andrew | Vice President, Sales | Dr. | 1952-02-19 | 1992-08-14 | 908 W. Capital Way | Tacoma | WA | 98401 | USA | (206) 555-9482 | 3457 | None | Andrew received his BTS commercial in 1974 and... | NaN | http://accweb/emmployees/fuller.bmp |
p_od = (Product[['Id','ProductName']]\
.rename(columns={'Id': 'ProductId'})
.join(
OrderDetail[['OrderId','ProductId','Quantity']]
,on='ProductId',lsuffix='',rsuffix='_od'
).drop('ProductId_od',axis=1)
)
p_od.head(2)
ProductId | ProductName | OrderId | Quantity | |
---|---|---|---|---|
0 | 1 | Chai | 10248 | 10 |
1 | 2 | Chang | 10248 | 5 |
Orders[['orderid','employeeid']]\
.rename(columns={'orderid':'OrderID',
'employeeid':'EmployeeID'})\
.head(2)
OrderID | EmployeeID | |
---|---|---|
0 | 10248 | 5 |
1 | 10249 | 6 |
p_od_o = p_od.join(
Orders[['orderid','employeeid']]\
.rename(columns={'orderid':'OrderID',
'employeeid':'EmployeeID'}))
p_od_o.head(2)
ProductId | ProductName | OrderId | Quantity | OrderID | EmployeeID | |
---|---|---|---|---|---|---|
0 | 1 | Chai | 10248 | 10 | 10248 | 5 |
1 | 2 | Chang | 10248 | 5 | 10249 | 6 |
Employee[['Id','LastName']]\
.rename(columns={'Id':'EmployeeID'})\
.head(2)
EmployeeID | LastName | |
---|---|---|
0 | 1 | Davolio |
1 | 2 | Fuller |
p_od_o_e = p_od_o.join(
Employee[['Id','LastName']]\
.rename(columns={'Id':'EmployeeID'}),
on='EmployeeID',lsuffix='',rsuffix='_emp'
)\
.sort_values(['OrderID','ProductId'])\
[['EmployeeID','LastName','OrderID','ProductName','Quantity']]
p_od_o_e.head(2)
EmployeeID | LastName | OrderID | ProductName | Quantity | |
---|---|---|---|---|---|
0 | 5 | Suyama | 10248 | Chai | 10 |
1 | 6 | King | 10249 | Chang | 5 |
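Note that the chained .join calls above align rows on the DataFrame index rather than on the key columns, which is why the employee names here disagree with the SQL result further below (order 10248 belongs to employee 5, Buchanan). A sketch of the same four-table join done with merge on explicit keys, using the column names shown in the tables above:
# join the four tables on their key columns rather than on the index
inv = (OrderDetail[['OrderId', 'ProductId', 'Quantity']]
       .merge(Product[['Id', 'ProductName']],
              left_on='ProductId', right_on='Id')
       .merge(Orders[['orderid', 'employeeid']],
              left_on='OrderId', right_on='orderid')
       .merge(Employee[['Id', 'LastName']],
              left_on='employeeid', right_on='Id', suffixes=('', '_emp'))
       .rename(columns={'employeeid': 'EmployeeID', 'OrderId': 'OrderID'})
       .sort_values(['OrderID', 'ProductId'])
       [['EmployeeID', 'LastName', 'OrderID', 'ProductName', 'Quantity']])
inv.head(3)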
%%sql
-- look at the different tables and note the id columns of each (listed below):
select * from Orders limit 1;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
"""
Product.Id
OrderDetail.orderid productid
Orders orderid employeeid quantity
Employee.Id LastName
""";
%%sql
SELECT e.id AS EmployeeId,
e.lastname,
o.orderid AS OrderId,
p.productname,
od.quantity
FROM Product p
JOIN OrderDetail od ON od.productid = p.id
JOIN Orders o ON o.orderid = od.orderid
JOIN Employee e ON e.id = o.employeeid
ORDER BY o.orderid,
od.productid
LIMIT 3;
EmployeeId | LastName | OrderId | ProductName | Quantity | |
---|---|---|---|---|---|
0 | 5 | Buchanan | 10248 | Queso Cabrales | 12 |
1 | 5 | Buchanan | 10248 | Singaporean Hokkien Fried Mee | 10 |
2 | 5 | Buchanan | 10248 | Mozzarella di Giovanni | 5 |
spark.sql(q).show()  # note: q still holds the earlier category-count query, not this problem's join
+--------------+-------------------+
|  CategoryName|count(CategoryName)|
+--------------+-------------------+
|   Confections|                 13|
|    Condiments|                 12|
|     Beverages|                 12|
|       Seafood|                 12|
|Dairy Products|                 10|
|Grains/Cereals|                  7|
|  Meat/Poultry|                  6|
|       Produce|                  5|
+--------------+-------------------+
There are some customers who have never actually placed an order. Show these customers.
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
np.setdiff1d(Customer['Id'], Orders['customerid'])
array(['FISSA', 'PARIS'], dtype=object)
set(Customer['Id']).difference(Orders['customerid'])
{'FISSA', 'PARIS'}
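The same anti-join can be done with merge and its indicator flag, which also keeps the customer details. A small sketch:
# left join Orders onto Customer and keep only the rows with no match
no_orders = (Customer.merge(Orders[['customerid']].drop_duplicates(),
                            left_on='Id', right_on='customerid',
                            how='left', indicator=True)
             .query("_merge == 'left_only'")[['Id', 'CompanyName']])
no_orders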
%%sql
select * from Orders limit 1;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
# Customer Id
# Orders customerid
%%sql
-- We need a LEFT JOIN here (a bare JOIN defaults to an inner join)
SELECT c.id,
o.customerid
FROM Customer c
LEFT JOIN Orders o ON o.customerid = c.id
WHERE o.customerid IS NULL;
Id | customerid | |
---|---|---|
0 | FISSA | None |
1 | PARIS | None |
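A Spark version of the same anti-join, using the left_anti join type on the DataFrames registered above; a small sketch:
# customers with no matching row in Orders
sCustomer.join(sOrders, sCustomer.Id == sOrders.customerid, how='left_anti')\
         .select('Id', 'CompanyName')\
         .show()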
One employee (Margaret Peacock, EmployeeID 4) has placed the most orders. However, there are some customers who've never placed an order with her. Show only those customers who have never placed an order with her.
Orders.head(2)
# NOTE: the Orders table has a customerid column, but it does not contain
# every customer; the complete list of customer ids is in the Customer table.
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders.shape
(830, 14)
Orders.employeeid.value_counts().head(2)
4    156
3    127
Name: employeeid, dtype: int64
Employee[Employee.Id==4].head(2)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 4 | Peacock | Margaret | Sales Representative | Mrs. | 1937-09-19 | 1993-05-03 | 4110 Old Redmond Rd. | Redmond | WA | 98052 | USA | (206) 555-8122 | 5176 | None | Margaret holds a BA in English literature from... | 2.0 | http://accweb/emmployees/peacock.bmp |
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
np.setdiff1d(Customer['Id'],
Orders.loc[Orders.employeeid==4, 'customerid']) # .shape
array(['CONSH', 'DUMON', 'FISSA', 'FRANR', 'GROSR', 'LAUGB', 'LAZYK', 'NORTS', 'PARIS', 'PERIC', 'PRINI', 'SANTG', 'SEVES', 'SPECD', 'THEBI', 'VINET'], dtype=object)
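An equivalent pandas approach that keeps the customer columns filters with isin and negation; a small sketch:
# customers whose Id never appears among Peacock's (employeeid 4) orders
peacock_customers = Orders.loc[Orders.employeeid == 4, 'customerid'].unique()
Customer.loc[~Customer.Id.isin(peacock_customers), ['Id', 'CompanyName']]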
%%sql
SELECT *
FROM Orders
LIMIT 2;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
# Customer: Id
# Orders: customerid employeeid
%%sql
SELECT c.Id,
o.customerid
FROM Customer c
LEFT JOIN Orders o ON (o.customerid = c.id
AND o.employeeid = 4)
WHERE o.customerid IS NULL;
Id | customerid | |
---|---|---|
0 | CONSH | None |
1 | DUMON | None |
2 | FISSA | None |
3 | FRANR | None |
4 | GROSR | None |
5 | LAUGB | None |
6 | LAZYK | None |
7 | NORTS | None |
8 | PARIS | None |
9 | PERIC | None |
10 | PRINI | None |
11 | SANTG | None |
12 | SEVES | None |
13 | SPECD | None |
14 | THEBI | None |
15 | VINET | None |
spark.sql(q).show()  # note: q still holds the earlier category-count query, not this problem's query
+--------------+-------------------+
|  CategoryName|count(CategoryName)|
+--------------+-------------------+
|   Confections|                 13|
|       Seafood|                 12|
|    Condiments|                 12|
|     Beverages|                 12|
|Dairy Products|                 10|
|Grains/Cereals|                  7|
|  Meat/Poultry|                  6|
|       Produce|                  5|
+--------------+-------------------+
sCustomer[['Id']].show(2)
+-----+
|   Id|
+-----+
|ALFKI|
|ANATR|
+-----+
only showing top 2 rows
sOrders[sOrders.employeeid==4].select('customerid').show(2)
+----------+
|customerid|
+----------+
|     HANAR|
|     SUPRD|
+----------+
only showing top 2 rows
sCustomer.select('Id').subtract(
sOrders[sOrders.employeeid==4].select('customerid')
).show()
+-----+
|   Id|
+-----+
|FRANR|
|SEVES|
|PARIS|
|VINET|
|SPECD|
|PERIC|
|PRINI|
|GROSR|
|CONSH|
|LAZYK|
|FISSA|
|THEBI|
|LAUGB|
|SANTG|
|NORTS|
|DUMON|
+-----+
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken: 0 hr 0 min 31 secs