This tutorial is based on the book "SQL Practice Problems". The book uses a sample database called Northwind, created by Microsoft, and modifies it by adding new entries. The book provides its answers using MS SQL Server 2016.
Notes:
The Order table's data is corrupted, but it can be recreated from the Northwind scripts in the Yugabyte GitHub repository.
# Notebook setup: record the start time, import the core libraries, and print
# the runtime environment via the `watermark` IPython extension (the versions
# appear in the cell output below).
import time
time_start_notebook = time.time()  # presumably used at the end to report elapsed time -- not visible in this chunk
import numpy as np
import pandas as pd
import functools # reduce
import sqlite3
import findspark, pyspark
# IPython magics: load watermark and print author/date/machine/library versions.
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Author: Bhishan Poudel Python implementation: CPython Python version : 3.7.7 IPython version : 7.16.1 Compiler : Clang 4.0.1 (tags/RELEASE_401/final) OS : Darwin Release : 19.6.0 Machine : x86_64 Processor : i386 CPU cores : 4 Architecture: 64bit numpy : 1.17.5 autopep8 : 1.5.4 sqlite3 : 2.6.0 findspark: 1.4.2 pandas : 1.0.5 json : 2.0.9 pyspark : 3.1.2
# pyspark settings
import os
import findspark
HOME = os.path.expanduser('~')
# findspark.init must point at the local Spark install BEFORE `import pyspark`
# below will resolve.
findspark.init(HOME + "/Softwares/Spark/spark-3.1.2-bin-hadoop3.2")
# Set spark environments
# Both driver and workers use the same conda env's Python interpreter.
os.environ['PYSPARK_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col as _col
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.window import Window
# NOTE(review): BooleanType is NOT imported here, but spark_df_from_pandas
# below maps pandas `bool` dtype to the string 'BooleanType()' that it later
# eval()s -- a bool column would raise NameError. Confirm / add the import.
from pyspark.sql.types import (StructField,
    StringType, IntegerType,
    FloatType, DoubleType, DateType,
    StructType)
from pyspark.sql.functions import (mean as _mean, min as _min,
    max as _max, avg as _avg,
    when as _when
    )
# Create (or reuse) the session; sc and sqlContext are used throughout the
# rest of the notebook.
spark = SparkSession\
    .builder\
    .appName("bhishan")\
    .getOrCreate()
sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")
print(pyspark.__version__)
3.1.2
# load the jupyter magic from ipython-sql
%load_ext sql
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False # no Done, rows affected
# autopandas=True makes every %sql / %%sql query return a pandas DataFrame.
%config SqlMagic.autopandas=True
%sql sqlite:////Volumes/Media/github/Tutorials_and_Lessons/SQL/c02_Book_sql_practice_problems_57/data/Northwind.sqlite
%%sql
-- List every table in the SQLite database (schema catalog).
SELECT *
FROM sqlite_master
WHERE TYPE='table';
type | name | tbl_name | rootpage | sql | |
---|---|---|---|---|---|
0 | table | Employee | Employee | 2 | CREATE TABLE "Employee" \n(\n "Id" INTEGER PR... |
1 | table | Category | Category | 3 | CREATE TABLE "Category" \n(\n "Id" INTEGER PR... |
2 | table | Customer | Customer | 4 | CREATE TABLE "Customer" \n(\n "Id" VARCHAR(80... |
3 | table | Shipper | Shipper | 8 | CREATE TABLE "Shipper" \n(\n "Id" INTEGER PRI... |
4 | table | Supplier | Supplier | 9 | CREATE TABLE "Supplier" \n(\n "Id" INTEGER PR... |
5 | table | Product | Product | 12 | CREATE TABLE "Product" \n(\n "Id" INTEGER PRI... |
6 | table | OrderDetail | OrderDetail | 14 | CREATE TABLE "OrderDetail" \n(\n "Id" VARCHAR... |
7 | table | CustomerCustomerDemo | CustomerCustomerDemo | 16 | CREATE TABLE "CustomerCustomerDemo" \n(\n "Id... |
8 | table | CustomerDemographic | CustomerDemographic | 18 | CREATE TABLE "CustomerDemographic" \n(\n "Id"... |
9 | table | Region | Region | 21 | CREATE TABLE "Region" \n(\n "Id" INTEGER PRIM... |
10 | table | Territory | Territory | 22 | CREATE TABLE "Territory" \n(\n "Id" VARCHAR(8... |
11 | table | EmployeeTerritory | EmployeeTerritory | 24 | CREATE TABLE "EmployeeTerritory" \n(\n "Id" V... |
12 | table | Order | Order | 11 | CREATE TABLE "Order" (\n"index" INTEGER,\n "O... |
13 | table | orders | orders | 155 | CREATE TABLE orders (\n orderid smallint NO... |
# Load every table into a pandas DataFrame via the %sql magic
# (autopandas=True, so each query returns a DataFrame).
Employee = %sql SELECT * from Employee;
Category = %sql SELECT * from Category;
Customer = %sql SELECT * from Customer;
Shipper = %sql SELECT * from Shipper;
Supplier = %sql SELECT * from Supplier;
# The "Order" table is corrupted (see notes at the top), so it is skipped.
# Order = %sql SELECT * from Order;
# Presumably this resolves to the lowercase `orders` table (SQLite table
# names are case-insensitive) -- verify against the sqlite_master listing.
Orders = %sql SELECT * from Orders;
Product = %sql SELECT * from Product;
OrderDetail = %sql SELECT * from OrderDetail;
CustomerCustomerDemo = %sql SELECT * from CustomerCustomerDemo;
CustomerDemographic = %sql SELECT * from CustomerDemographic;
Region = %sql SELECT * from Region;
Territory = %sql SELECT * from Territory;
EmployeeTerritory = %sql SELECT * from EmployeeTerritory;
# SQLite returns dates as text; convert the three date columns so .dt
# accessors work later in the notebook.
Orders['orderdate'] = pd.to_datetime(Orders['orderdate'])
Orders['requireddate'] = pd.to_datetime(Orders['requireddate'])
Orders['shippeddate'] = pd.to_datetime(Orders['shippeddate'])
Orders.dtypes
orderid int64 customerid object employeeid int64 orderdate datetime64[ns] requireddate datetime64[ns] shippeddate datetime64[ns] shipvia int64 freight float64 shipname object shipaddress object shipcity object shipregion object shippostalcode object shipcountry object dtype: object
# Names of the loaded tables, kept in the same order as `dfs` below.
dfs_names = ['Employee', 'Category', 'Customer',
             'Shipper', 'Supplier', 'Orders',
             'Product', 'OrderDetail',
             'CustomerCustomerDemo', 'CustomerDemographic',
             'Region', 'Territory', 'EmployeeTerritory']
dfs = [Employee, Category, Customer, Shipper, Supplier,
       Orders, Product, OrderDetail,
       CustomerCustomerDemo, CustomerDemographic,
       Region, Territory, EmployeeTerritory]
# Quick sanity preview: shape and first two rows of every table.
for name, frame in zip(dfs_names, dfs):
    print(name, frame.shape)
    display(frame.head(2))
Employee (9, 18)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
1 | 2 | Fuller | Andrew | Vice President, Sales | Dr. | 1952-02-19 | 1992-08-14 | 908 W. Capital Way | Tacoma | WA | 98401 | USA | (206) 555-9482 | 3457 | None | Andrew received his BTS commercial in 1974 and... | NaN | http://accweb/emmployees/fuller.bmp |
Category (8, 3)
Id | CategoryName | Description | |
---|---|---|---|
0 | 1 | Beverages | Soft drinks, coffees, teas, beers, and ales |
1 | 2 | Condiments | Sweet and savory sauces, relishes, spreads, an... |
Customer (91, 11)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Shipper (3, 3)
Id | CompanyName | Phone | |
---|---|---|---|
0 | 1 | Speedy Express | (503) 555-9831 |
1 | 2 | United Package | (503) 555-3199 |
Supplier (29, 12)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | HomePage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Exotic Liquids | Charlotte Cooper | Purchasing Manager | 49 Gilbert St. | London | None | EC1 4SD | UK | (171) 555-2222 | None | None |
1 | 2 | New Orleans Cajun Delights | Shelley Burke | Order Administrator | P.O. Box 78934 | New Orleans | LA | 70117 | USA | (100) 555-4822 | None | #CAJUN.HTM# |
Orders (830, 14)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Product (77, 10)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
OrderDetail (2155, 6)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
CustomerCustomerDemo (0, 0)
CustomerDemographic (0, 0)
Region (4, 2)
Id | RegionDescription | |
---|---|---|
0 | 1 | Eastern |
1 | 2 | Western |
Territory (53, 3)
Id | TerritoryDescription | RegionId | |
---|---|---|---|
0 | 01581 | Westboro | 1 |
1 | 01730 | Bedford | 1 |
EmployeeTerritory (49, 3)
Id | EmployeeId | TerritoryId | |
---|---|---|---|
0 | 1/06897 | 1 | 06897 |
1 | 1/19713 | 1 | 19713 |
def spark_df_from_pandas(pandas_df):
    """Build a Spark DataFrame from *pandas_df* with an explicit schema.

    Maps each pandas dtype to the matching Spark SQL type
    (int64 -> IntegerType, float64 -> DoubleType, bool -> BooleanType,
    object -> StringType, datetime64[ns] -> DateType). Any unrecognized
    dtype falls back to StringType instead of crashing. All columns are
    nullable.

    Relies on the module-level ``sqlContext`` created in the setup cell.
    Raises IndexError (inside Spark) for an empty (0, 0) frame, which the
    callers below guard against.
    """
    # BooleanType is not in the notebook's top-of-file import list, so bring
    # it in locally.
    from pyspark.sql.types import BooleanType

    dtype_to_spark = {
        'int64': IntegerType(),
        'float64': DoubleType(),
        'bool': BooleanType(),
        'object': StringType(),
        'datetime64[ns]': DateType(),
    }
    fields = [
        StructField(name, dtype_to_spark.get(str(dtype), StringType()), True)
        for name, dtype in pandas_df.dtypes.items()
    ]
    # Building StructType directly replaces the original string-building +
    # eval() approach, which was fragile (an unmapped dtype produced NaN in
    # the generated source and an un-evaluable schema) and used eval
    # needlessly.
    return sqlContext.createDataFrame(pandas_df, StructType(fields))
# Convert each pandas table to Spark and register it as a temporary view so
# the same table names are queryable from spark.sql(...).
sEmployee = spark_df_from_pandas(Employee)
sEmployee.createOrReplaceTempView("Employee")
sCategory = spark_df_from_pandas(Category)
sCategory.createOrReplaceTempView("Category")
sCustomer = spark_df_from_pandas(Customer)
sCustomer.createOrReplaceTempView("Customer")
sShipper = spark_df_from_pandas(Shipper)
sShipper.createOrReplaceTempView("Shipper")
sSupplier = spark_df_from_pandas(Supplier)
sSupplier.createOrReplaceTempView("Supplier")
sOrders = spark_df_from_pandas(Orders)
sOrders.createOrReplaceTempView("Orders")
sProduct = spark_df_from_pandas(Product)
sProduct.createOrReplaceTempView("Product")
sOrderDetail = spark_df_from_pandas(OrderDetail)
sOrderDetail.createOrReplaceTempView("OrderDetail")
# The two demographic tables are empty ((0, 0) in the preview above) and the
# conversion raises "IndexError: list index out of range", so convert them
# best-effort and skip on failure. `except Exception` (not a bare except)
# still lets KeyboardInterrupt/SystemExit through.
try:
    sCustomerCustomerDemo = spark_df_from_pandas(CustomerCustomerDemo)
    sCustomerCustomerDemo.createOrReplaceTempView("CustomerCustomerDemo")
except Exception:
    pass  # empty table -- no view registered
try:
    sCustomerDemographic = spark_df_from_pandas(CustomerDemographic)
    sCustomerDemographic.createOrReplaceTempView("CustomerDemographic")
except Exception:
    pass  # empty table -- no view registered
sRegion = spark_df_from_pandas(Region)
sRegion.createOrReplaceTempView("Region")
sTerritory = spark_df_from_pandas(Territory)
sTerritory.createOrReplaceTempView("Territory")
sEmployeeTerritory = spark_df_from_pandas(EmployeeTerritory)
sEmployeeTerritory.createOrReplaceTempView("EmployeeTerritory")
# Re-list every table with its shape and first two rows (same preview as the
# earlier cell; presumably re-run after registering the Spark views to confirm
# the pandas frames are unchanged -- the outputs are identical).
dfs_names = [ 'Employee', 'Category','Customer',
    'Shipper','Supplier','Orders',
    'Product','OrderDetail',
    'CustomerCustomerDemo','CustomerDemographic',
    'Region', 'Territory', 'EmployeeTerritory']
dfs = [ Employee, Category,Customer,Shipper,Supplier,
    Orders, Product,OrderDetail,
    CustomerCustomerDemo,CustomerDemographic,
    Region, Territory, EmployeeTerritory]
for df_name, df in zip(dfs_names,dfs):
    print(df_name, df.shape)
    display(df.head(2))
Employee (9, 18)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
1 | 2 | Fuller | Andrew | Vice President, Sales | Dr. | 1952-02-19 | 1992-08-14 | 908 W. Capital Way | Tacoma | WA | 98401 | USA | (206) 555-9482 | 3457 | None | Andrew received his BTS commercial in 1974 and... | NaN | http://accweb/emmployees/fuller.bmp |
Category (8, 3)
Id | CategoryName | Description | |
---|---|---|---|
0 | 1 | Beverages | Soft drinks, coffees, teas, beers, and ales |
1 | 2 | Condiments | Sweet and savory sauces, relishes, spreads, an... |
Customer (91, 11)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Shipper (3, 3)
Id | CompanyName | Phone | |
---|---|---|---|
0 | 1 | Speedy Express | (503) 555-9831 |
1 | 2 | United Package | (503) 555-3199 |
Supplier (29, 12)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | HomePage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Exotic Liquids | Charlotte Cooper | Purchasing Manager | 49 Gilbert St. | London | None | EC1 4SD | UK | (171) 555-2222 | None | None |
1 | 2 | New Orleans Cajun Delights | Shelley Burke | Order Administrator | P.O. Box 78934 | New Orleans | LA | 70117 | USA | (100) 555-4822 | None | #CAJUN.HTM# |
Orders (830, 14)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Product (77, 10)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
OrderDetail (2155, 6)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
CustomerCustomerDemo (0, 0)
CustomerDemographic (0, 0)
Region (4, 2)
Id | RegionDescription | |
---|---|---|
0 | 1 | Eastern |
1 | 2 | Western |
Territory (53, 3)
Id | TerritoryDescription | RegionId | |
---|---|---|---|
0 | 01581 | Westboro | 1 |
1 | 01730 | Bedford | 1 |
EmployeeTerritory (49, 3)
Id | EmployeeId | TerritoryId | |
---|---|---|---|
0 | 1/06897 | 1 | 06897 |
1 | 1/19713 | 1 | 19713 |
# pyspark settings
# (This cell is a duplicate of the earlier setup cell -- presumably the
# notebook was restarted here; SparkSession.getOrCreate() reuses any live
# session.)
import os
import findspark
HOME = os.path.expanduser('~')
# findspark.init must point at the local Spark install BEFORE `import pyspark`.
findspark.init(HOME + "/Softwares/Spark/spark-3.1.2-bin-hadoop3.2")
# Set spark environments
os.environ['PYSPARK_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col as _col
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.window import Window
# NOTE(review): BooleanType is not imported -- see the review note on the
# first copy of this cell.
from pyspark.sql.types import (StructField,
    StringType, IntegerType,
    FloatType, DoubleType, DateType,
    StructType)
from pyspark.sql.functions import (mean as _mean, min as _min,
    max as _max, avg as _avg,
    when as _when
    )
spark = SparkSession\
    .builder\
    .appName("bhishan")\
    .getOrCreate()
sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")
print(pyspark.__version__)
3.1.2
def spark_df_from_pandas(pandas_df):
    """Build a Spark DataFrame from *pandas_df* with an explicit schema.

    (Duplicate definition from the restarted-session section of the
    notebook.) Maps each pandas dtype to the matching Spark SQL type
    (int64 -> IntegerType, float64 -> DoubleType, bool -> BooleanType,
    object -> StringType, datetime64[ns] -> DateType); unrecognized dtypes
    fall back to StringType. All columns are nullable.

    Relies on the module-level ``sqlContext``. Raises IndexError (inside
    Spark) for an empty (0, 0) frame, which callers guard against.
    """
    # BooleanType is not in the notebook's import list, so bring it in here.
    from pyspark.sql.types import BooleanType

    dtype_to_spark = {
        'int64': IntegerType(),
        'float64': DoubleType(),
        'bool': BooleanType(),
        'object': StringType(),
        'datetime64[ns]': DateType(),
    }
    fields = [
        StructField(name, dtype_to_spark.get(str(dtype), StringType()), True)
        for name, dtype in pandas_df.dtypes.items()
    ]
    # Direct StructType construction replaces the original generate-source +
    # eval() approach, which broke on unmapped dtypes (NaN in the generated
    # code) and used eval needlessly.
    return sqlContext.createDataFrame(pandas_df, StructType(fields))
# Re-register every pandas table as a Spark temp view (duplicate of the
# earlier registration cell, presumably after a kernel restart).
sEmployee = spark_df_from_pandas(Employee)
sEmployee.createOrReplaceTempView("Employee")
sCategory = spark_df_from_pandas(Category)
sCategory.createOrReplaceTempView("Category")
sCustomer = spark_df_from_pandas(Customer)
sCustomer.createOrReplaceTempView("Customer")
sShipper = spark_df_from_pandas(Shipper)
sShipper.createOrReplaceTempView("Shipper")
sSupplier = spark_df_from_pandas(Supplier)
sSupplier.createOrReplaceTempView("Supplier")
sOrders = spark_df_from_pandas(Orders)
sOrders.createOrReplaceTempView("Orders")
sProduct = spark_df_from_pandas(Product)
sProduct.createOrReplaceTempView("Product")
sOrderDetail = spark_df_from_pandas(OrderDetail)
sOrderDetail.createOrReplaceTempView("OrderDetail")
# CustomerCustomerDemo is empty ((0, 0)) and the conversion raises
# "IndexError: list index out of range", so convert it best-effort.
# `except Exception` (not a bare except) still lets
# KeyboardInterrupt/SystemExit through.
try:
    sCustomerCustomerDemo = spark_df_from_pandas(CustomerCustomerDemo)
    sCustomerCustomerDemo.createOrReplaceTempView("CustomerCustomerDemo")
except Exception:
    pass  # empty table -- no view registered
sRegion = spark_df_from_pandas(Region)
sRegion.createOrReplaceTempView("Region")
sTerritory = spark_df_from_pandas(Territory)
sTerritory.createOrReplaceTempView("Territory")
sEmployeeTerritory = spark_df_from_pandas(EmployeeTerritory)
sEmployeeTerritory.createOrReplaceTempView("EmployeeTerritory")
We want to send all of our high-value customers a special VIP gift. We're defining high-value customers as those who've made at least 1 order with a total value (not including the discount) equal to $10,000 or more. We only want to consider orders made in the year 1997.
OrderDetail.head(2)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
OrderDetail['Discount'].max() # discount is percentage.
0.25
# Line-item amount net of the fractional discount (Discount is a percentage
# expressed as a fraction, max 0.25 above).
net_fraction = 1 - OrderDetail['Discount']
OrderDetail['OrderAmount'] = (OrderDetail['UnitPrice']
                              .mul(OrderDetail['Quantity'])
                              .mul(net_fraction))
OrderDetail.head(2)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | OrderAmount | |
---|---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 | 168.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 | 98.0 |
# find total order amount per orderid: sum the line-item amounts within each
# order and broadcast the sum back onto every line of that order.
order_totals = OrderDetail.groupby('OrderId')['OrderAmount'].transform('sum')
OrderDetail['TotalOrderAmount'] = order_totals
OrderDetail.head(2)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | OrderAmount | TotalOrderAmount | |
---|---|---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 | 168.0 | 440.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 | 98.0 | 440.0 |
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders['orderdate'].agg([min,max])
min 1996-07-04 max 1998-05-06 Name: orderdate, dtype: datetime64[ns]
# Keep only the orders placed during 1997, with just the join keys we need.
in_1997 = Orders['orderdate'].dt.year == 1997
o1 = Orders[in_1997][['orderid', 'customerid']]
print(o1.shape)
o1.head(2)
(408, 2)
orderid | customerid | |
---|---|---|
152 | 10400 | EASTC |
153 | 10401 | RATTC |
# Look at yugabyte website ER diagram to find columns relation.
# required columns from book
# CustomerID CompanyName OrderID TotalOrderAmount
# Orders Customers Orders CreatedFromOrderDetails
# note: shippers has company name but orders does not have shipper_id.
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
c1 = Customer[['Id','CompanyName']]
c1.head(2)
Id | CompanyName | |
---|---|---|
0 | ALFKI | Alfreds Futterkiste |
1 | ANATR | Ana Trujillo Emparedados y helados |
od1 = OrderDetail[['OrderId','TotalOrderAmount']]
od1 = od1[od1['TotalOrderAmount'] >= 10_000]
od1.head(2)
OrderId | TotalOrderAmount | |
---|---|---|
449 | 10417 | 11188.4 |
450 | 10417 | 11188.4 |
o1.head(2)
orderid | customerid | |
---|---|---|
152 | 10400 | EASTC |
153 | 10401 | RATTC |
# Join the high-value order lines to the 1997 orders to pick up customerid.
tmp = (od1
       .merge(o1, left_on='OrderId', right_on='orderid',
              suffixes=['_od', '_o'])
       .rename(columns={'Id': 'Id_od'}))
tmp.head(2)
OrderId | TotalOrderAmount | orderid | customerid | |
---|---|---|---|---|
0 | 10417 | 11188.4 | 10417 | SIMOB |
1 | 10417 | 11188.4 | 10417 | SIMOB |
# OrderDetail['OrderAmount'] = ( OrderDetail['UnitPrice']
# - OrderDetail['Discount']
# ) * OrderDetail['Quantity']
# OrderDetail['TotalOrderAmount'] = OrderDetail.groupby('OrderId')\
# ['OrderAmount'].transform('sum')
# Attach the company names, tidy the columns, and rank by order value.
tmp = (tmp
       .merge(c1, left_on='customerid', right_on='Id')
       .drop(columns=['orderid', 'Id'])
       [['customerid', 'CompanyName', 'OrderId', 'TotalOrderAmount']]
       .drop_duplicates()
       .sort_values('TotalOrderAmount', ascending=False))
tmp
customerid | CompanyName | OrderId | TotalOrderAmount | |
---|---|---|---|---|
0 | SIMOB | Simons bistro | 10417 | 11188.4 |
4 | RATTC | Rattlesnake Canyon Grocery | 10479 | 10495.6 |
8 | QUICK | QUICK-Stop | 10540 | 10191.7 |
12 | QUICK | QUICK-Stop | 10691 | 10164.8 |
# Consolidated answer: customers with at least one 1997 order worth >= $10,000.
o1 = Orders[Orders['orderdate'].dt.year == 1997][['orderid', 'customerid']]
c1 = Customer[['Id', 'CompanyName']]
od1 = OrderDetail[['OrderId', 'TotalOrderAmount']]
od1 = od1.loc[od1['TotalOrderAmount'] >= 10_000]
tmp = (od1
       .merge(o1, left_on='OrderId', right_on='orderid',
              suffixes=['_od', '_o'])
       .rename(columns={'Id': 'Id_od'})
       .merge(c1, left_on='customerid', right_on='Id')
       .drop(columns=['orderid', 'Id'])
       [['customerid', 'CompanyName', 'OrderId', 'TotalOrderAmount']]
       .drop_duplicates()
       .sort_values('TotalOrderAmount', ascending=False))
# Drop the helper columns so OrderDetail goes back to its original shape.
OrderDetail.drop(columns=['OrderAmount', 'TotalOrderAmount'], inplace=True)
tmp
customerid | CompanyName | OrderId | TotalOrderAmount | |
---|---|---|---|---|
0 | SIMOB | Simons bistro | 10417 | 11188.4 |
4 | RATTC | Rattlesnake Canyon Grocery | 10479 | 10495.6 |
8 | QUICK | QUICK-Stop | 10540 | 10191.7 |
12 | QUICK | QUICK-Stop | 10691 | 10164.8 |
OrderDetail.head(1)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
%sql SELECT * from Customer limit 1;
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
%sql SELECT * from Orders limit 1;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
%sql SELECT * from OrderDetail limit 1;
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14 | 12 | 0.0 |
%%sql
-- High-value customers: any single 1997 order whose discounted total is
-- >= $10,000. Discount is a fraction, so (1 - Discount) is the net factor.
SELECT c.Id AS CustomerID,
c.CompanyName,
o.OrderID,
sum(od.UnitPrice * od.Quantity * (1 - od.Discount)) AS amount
FROM Customer AS c
INNER JOIN Orders AS o ON c.Id = o.customerid
INNER JOIN OrderDetail AS od ON o.orderid = od.OrderId -- WHERE substr(o.orderdate,1,4) = '1997'
-- Half-open date range keeps exactly the year 1997.
WHERE o.orderdate >= date('1997-01-01')
AND o.orderdate < date('1998-01-01')
GROUP BY c.Id,
c.CompanyName,
o.orderid
HAVING amount >= 10000
ORDER BY amount DESC;
CustomerID | CompanyName | orderid | amount | |
---|---|---|---|---|
0 | SIMOB | Simons bistro | 10417 | 11188.4 |
1 | RATTC | Rattlesnake Canyon Grocery | 10479 | 10495.6 |
2 | QUICK | QUICK-Stop | 10540 | 10191.7 |
3 | QUICK | QUICK-Stop | 10691 | 10164.8 |
"""
# WHERE YEAR(cast(o.orderdate as date)) = 1997 -- postgres has function YEAR.
# WHERE strftime('%y', o.orderdate) = '1997' -- this gives empty result.
# WHERE o.orderdate >= '19970101' and o.orderdate < '19980101' -- gives empty result.
""";
%sql pragma table_info('Orders');
cid | name | type | notnull | dflt_value | pk | |
---|---|---|---|---|---|---|
0 | 0 | orderid | smallint | 1 | None | 0 |
1 | 1 | customerid | bpchar | 0 | None | 0 |
2 | 2 | employeeid | smallint | 0 | None | 0 |
3 | 3 | orderdate | date | 0 | None | 0 |
4 | 4 | requireddate | date | 0 | None | 0 |
5 | 5 | shippeddate | date | 0 | None | 0 |
6 | 6 | shipvia | smallint | 0 | None | 0 |
7 | 7 | freight | real | 0 | None | 0 |
8 | 8 | shipname | character varying(40) | 0 | None | 0 |
9 | 9 | shipaddress | character varying(60) | 0 | None | 0 |
10 | 10 | shipcity | character varying(15) | 0 | None | 0 |
11 | 11 | shipregion | character varying(15) | 0 | None | 0 |
12 | 12 | shippostalcode | character varying(10) | 0 | None | 0 |
13 | 13 | shipcountry | character varying(15) | 0 | None | 0 |
tmp
customerid | CompanyName | OrderId | TotalOrderAmount | |
---|---|---|---|---|
0 | SIMOB | Simons bistro | 10417 | 11188.4 |
4 | RATTC | Rattlesnake Canyon Grocery | 10479 | 10495.6 |
8 | QUICK | QUICK-Stop | 10540 | 10191.7 |
12 | QUICK | QUICK-Stop | 10691 | 10164.8 |
The manager has changed his mind. Instead of
requiring that customers have at least one individual
order totaling $10,000 or more, he wants to define
high-value customers as those whose orders
total $15,000 or more in 1997. How would you
change the answer to the problem above?
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
OrderDetail.head(1)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
Customer.head(1)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
# Customers whose 1997 orders sum to more than $15,000 (discount included).
# One parenthesized chain instead of backslash continuations.
df = (
    Orders
    .merge(OrderDetail,
           left_on='orderid', right_on='OrderId',
           suffixes=['_o', '_od'])
    .merge(Customer,
           left_on='customerid', right_on='Id',
           suffixes=['_o_od', '_c'])
    # keep only orders placed during 1997
    .loc[lambda d: d['orderdate'].dt.year == 1997]
    # line-item amount net of the fractional discount
    .assign(OrderAmount=lambda d: (d['UnitPrice']
                                   * d['Quantity']
                                   * (1 - d['Discount'])))
    # total per customer, broadcast to every row of that customer
    .assign(CustomerSum=lambda d: d.groupby(['customerid', 'CompanyName'])
                                   ['OrderAmount'].transform('sum'))
    .loc[lambda d: d['CustomerSum'] > 15000]
    .drop_duplicates(['customerid', 'CompanyName'])
    .sort_values('CustomerSum', ascending=False)
    .reset_index(drop=True)
    [['customerid', 'CompanyName', 'CustomerSum']]
)
print(df.shape)
df
(7, 3)
customerid | CompanyName | CustomerSum | |
---|---|---|---|
0 | QUICK | QUICK-Stop | 61109.9100 |
1 | SAVEA | Save-a-lot Markets | 57713.5750 |
2 | ERNSH | Ernst Handel | 48096.2635 |
3 | MEREP | Mère Paillarde | 23332.3100 |
4 | HUNGO | Hungry Owl All-Night Grocers | 20454.4050 |
5 | RATTC | Rattlesnake Canyon Grocery | 19383.7500 |
6 | SIMOB | Simons bistro | 16232.4125 |
%%sql
-- High-value customers, revised definition: 1997 orders totaling >= $15,000
-- per customer (discount included via the (1 - Discount) factor).
SELECT c.Id AS CustomerID,
c.CompanyName,
sum(od.UnitPrice * od.Quantity * (1-od.Discount)) AS CustomerSum
FROM Customer AS c
INNER JOIN Orders AS o ON c.Id = o.customerid
INNER JOIN OrderDetail AS od ON o.orderid = od.OrderId
-- orderdate is stored as ISO text, so the first four characters are the year.
WHERE substr(o.orderdate, 1, 4) = '1997'
GROUP BY c.Id,
c.CompanyName
HAVING CustomerSum >= 15000
ORDER BY CustomerSum DESC;
CustomerID | CompanyName | CustomerSum | |
---|---|---|---|
0 | QUICK | QUICK-Stop | 61109.9100 |
1 | SAVEA | Save-a-lot Markets | 57713.5750 |
2 | ERNSH | Ernst Handel | 48096.2635 |
3 | MEREP | Mère Paillarde | 23332.3100 |
4 | HUNGO | Hungry Owl All-Night Grocers | 20454.4050 |
5 | RATTC | Rattlesnake Canyon Grocery | 19383.7500 |
6 | SIMOB | Simons bistro | 16232.4125 |
Change the above query to use the discount when calculating high-value customers. Order by the total amount which includes the discount.
# I already included discount.
At the end of the month, salespeople are likely to try much harder to get orders, to meet their month-end quotas. Show all orders made on the last day of the month. Order by EmployeeID and OrderID
# we need EmployeeID OrderID OrderDate
# Only Orders table is needed.
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
Orders.groupby([Orders.orderdate.dt.year,Orders.orderdate.dt.month],as_index=False).orderdate.transform('max').head()
orderdate | |
---|---|
0 | 1996-07-31 |
1 | 1996-07-31 |
2 | 1996-07-31 |
3 | 1996-07-31 |
4 | 1996-07-31 |
Orders.groupby([Orders.orderdate.dt.year,Orders.orderdate.dt.month],as_index=False).orderdate.transform('max').tail()
orderdate | |
---|---|
825 | 1998-05-06 |
826 | 1998-05-06 |
827 | 1998-05-06 |
828 | 1998-05-06 |
829 | 1998-05-06 |
# Orders placed on the last calendar day of their month, sorted by employee
# then order id.
is_last_day = Orders.orderdate.dt.day == Orders.orderdate.dt.days_in_month
(
    Orders.loc[is_last_day, ['employeeid', 'orderid', 'orderdate']]
    .sort_values(['employeeid', 'orderid'])
    .reset_index(drop=True)
)
employeeid | orderid | orderdate | |
---|---|---|---|
0 | 1 | 10461 | 1997-02-28 |
1 | 1 | 10616 | 1997-07-31 |
2 | 2 | 10583 | 1997-06-30 |
3 | 2 | 10686 | 1997-09-30 |
4 | 2 | 10989 | 1998-03-31 |
5 | 2 | 11060 | 1998-04-30 |
6 | 3 | 10432 | 1997-01-31 |
7 | 3 | 10806 | 1997-12-31 |
8 | 3 | 10988 | 1998-03-31 |
9 | 3 | 11063 | 1998-04-30 |
10 | 4 | 10343 | 1996-10-31 |
11 | 4 | 10522 | 1997-04-30 |
12 | 4 | 10584 | 1997-06-30 |
13 | 4 | 10617 | 1997-07-31 |
14 | 4 | 10725 | 1997-10-31 |
15 | 4 | 10807 | 1997-12-31 |
16 | 4 | 11061 | 1998-04-30 |
17 | 4 | 11062 | 1998-04-30 |
18 | 5 | 10269 | 1996-07-31 |
19 | 6 | 10317 | 1996-09-30 |
20 | 7 | 10490 | 1997-03-31 |
21 | 8 | 10399 | 1996-12-31 |
22 | 8 | 10460 | 1997-02-28 |
23 | 8 | 10491 | 1997-03-31 |
24 | 8 | 10987 | 1998-03-31 |
25 | 9 | 10687 | 1997-09-30 |
# Alternative last-day-of-month query kept as a string (never executed here).
# NOTE(review): DAY() and DATE_ADD(..., INTERVAL 1 DAY) are MySQL functions,
# not Postgres -- the "works for postgres" note inside the string looks
# wrong; confirm before reusing this query.
for_postgres = """
-- works for postgres
SELECT EmployeeID, OrderID, OrderDate
FROM Orders
WHERE DAY(DATE_ADD(OrderDate, INTERVAL 1 DAY)) = 1
ORDER BY EmployeeID, OrderID;
"""
"""
%%sql
-- (sqlite3.OperationalError) near "OrderID": syntax error
-- only sqlite
SELECT EmployeeID,
OrderID,
OrderDate
FROM Orders
WHERE orderdate =
(SELECT date(orderdate, 'start of month', '+1 month', '-1 day'))
ORDER BY EmployeeID,
OrderID;
""";
"""
-- no such function: last_day
SELECT EmployeeID,
OrderID,
OrderDate
FROM Orders
WHERE orderdate = last_day(orderdate)
ORDER BY EmployeeID,
OrderID;
"""
'\n-- no such function: last_day\n\nSELECT EmployeeID,\n OrderID,\n OrderDate\nFROM Orders\nWHERE orderdate = last_day(orderdate)\nORDER BY EmployeeID,\n OrderID;\n'
The Northwind mobile app developers are testing an app that customers will use to show orders. In order to make sure that even the largest orders will show up correctly on the app, they'd like some samples of orders that have lots of individual line items. Show the 10 orders with the most line items, in order of total line items.
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
OrderDetail.head(1)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
OrderDetail.groupby('OrderId')['Id'].count().nlargest(10)
OrderId 11077 25 10657 6 10847 6 10979 6 10273 5 10294 5 10309 5 10324 5 10325 5 10337 5 Name: Id, dtype: int64
%sql select * from OrderDetail limit 1;
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14 | 12 | 0.0 |
%%sql
SELECT OrderId,
count(*) AS TotalOrderDetail
FROM OrderDetail
GROUP BY orderid
ORDER BY TotalOrderDetail DESC
LIMIT 10;
OrderId | TotalOrderDetail | |
---|---|---|
0 | 11077 | 25 |
1 | 10979 | 6 |
2 | 10847 | 6 |
3 | 10657 | 6 |
4 | 11064 | 5 |
5 | 11031 | 5 |
6 | 11021 | 5 |
7 | 10962 | 5 |
8 | 10893 | 5 |
9 | 10861 | 5 |
# spark.sql(q).show()
sOrderDetail.groupby('OrderId').agg({'Id':'count'}).orderBy('count(Id)',ascending=False).limit(10).show()
+-------+---------+ |OrderId|count(Id)| +-------+---------+ | 11077| 25| | 10979| 6| | 10847| 6| | 10657| 6| | 10514| 5| | 10294| 5| | 10962| 5| | 10558| 5| | 10845| 5| | 10458| 5| +-------+---------+
The Northwind mobile app developers would now like to just get a random assortment of orders for beta testing on their app. Show a random set of 2% of all orders.
Orders.shape
(830, 14)
Orders.sample(frac=0.02,random_state=100).shape
(17, 14)
%%sql
SELECT *
FROM Orders
ORDER BY RANDOM()
LIMIT 2;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10927 | LACOR | 4 | 1998-03-05 | 1998-04-02 | 1998-04-08 | 1 | 19.790001 | La corne d'abondance | 67, avenue de l'Europe | Versailles | None | 78000 | France |
1 | 10293 | TORTU | 1 | 1996-08-29 | 1996-09-26 | 1996-09-11 | 3 | 21.180000 | Tortuga Restaurante | Avda. Azteca 123 | México D.F. | None | 05033 | Mexico |
%%sql
-- The task asks for 2% of all orders: 0.02, not 0.2 (0.2 is 20% and
-- yields 166 of 830 rows instead of ~16).
SELECT cast(0.02 * count(*) AS int)
FROM Orders;
cast(0.2 * count(*) AS int) | |
---|---|
0 | 166 |
%%sql
-- Random 2% sample of Orders: the inner LIMIT is 2% (0.02) of the row
-- count; the original 0.2 sampled 20%.  Outer LIMIT 2 is just for display.
SELECT *
FROM
  (SELECT *
   FROM Orders
   ORDER BY RANDOM()
   LIMIT
     (SELECT cast(0.02 * count(*) AS int)
      FROM Orders))
LIMIT 2;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10603 | SAVEA | 8 | 1997-07-18 | 1997-08-15 | 1997-08-08 | 2 | 48.770001 | Save-a-lot Markets | 187 Suffolk Ln. | Boise | ID | 83720 | USA |
1 | 10757 | SAVEA | 6 | 1997-11-27 | 1997-12-25 | 1997-12-15 | 1 | 8.190000 | Save-a-lot Markets | 187 Suffolk Ln. | Boise | ID | 83720 | USA |
# ms sql server
q_ms_sql = """
Select top 2 percent
OrderID
From Orders
Order By NewID()
""";
sOrders.sample(fraction=0.2).show(2)
+-------+----------+----------+----------+------------+-----------+-------+----------+--------------+---------------+--------------+----------+--------------+-----------+ |orderid|customerid|employeeid| orderdate|requireddate|shippeddate|shipvia| freight| shipname| shipaddress| shipcity|shipregion|shippostalcode|shipcountry| +-------+----------+----------+----------+------------+-----------+-------+----------+--------------+---------------+--------------+----------+--------------+-----------+ | 10250| HANAR| 4|1996-07-08| 1996-08-05| 1996-07-12| 2|65.8300018| Hanari Carnes|Rua do Paço, 67|Rio de Janeiro| RJ| 05454-876| Brazil| | 10266| WARTH| 3|1996-07-26| 1996-09-06| 1996-07-31| 3|25.7299995|Wartian Herkku| Torikatu 38| Oulu| null| 90110| Finland| +-------+----------+----------+----------+------------+-----------+-------+----------+--------------+---------------+--------------+----------+--------------+-----------+ only showing top 2 rows
# spark.sql(q) # function RANDOM() is not in pyspark sql
# %%sql
# SELECT orderid,
# customerid
# FROM Orders tablesample(2 percent);
Janet Leverling, one of the salespeople, has come to you with a request. She thinks that she accidentally double-entered a line item on an order, with a different ProductID, but the same quantity. She remembers that the quantity was 60 or more. Show all the OrderIDs with line items that match this, in order of OrderID.
OrderDetail.head(2)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
# Index of rows whose (OrderId, Quantity) pair, with Quantity >= 60,
# occurs more than once — i.e. a potential double-entered line item.
# transform('count') broadcasts each group's size back onto every row,
# so the original row index survives the count > 1 filter.
idx = OrderDetail[OrderDetail.Quantity >= 60]\
.groupby(['OrderId','Quantity'])['Id'].transform('count')\
.loc[lambda x: x>1].index
# OrderDetail.loc[idx]
# Same duplicate detection via GroupBy.filter: keeps only rows belonging
# to groups with more than one member.  f is the filtered 'Id' Series,
# whose index points back into OrderDetail.
f = OrderDetail[OrderDetail.Quantity >= 60]\
.groupby(['OrderId','Quantity'])['Id']\
.filter(lambda dfx: len(dfx)>1)
# OrderDetail.loc[f.index]
# One representative row per (OrderId, Quantity) pair that was entered
# more than once with Quantity >= 60.
big_qty = OrderDetail[OrderDetail.Quantity >= 60]
dup_ids = big_qty.groupby(['OrderId', 'Quantity'])['Id'].filter(
    lambda grp: len(grp) > 1)
OrderDetail.loc[dup_ids.index].drop_duplicates(['OrderId', 'Quantity'])
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
43 | 10263/16 | 10263 | 16 | 13.9 | 60 | 0.25 |
1080 | 10658/40 | 10658 | 40 | 18.4 | 70 | 0.05 |
1916 | 10990/21 | 10990 | 21 | 10.0 | 65 | 0.00 |
2017 | 11030/2 | 11030 | 2 | 19.0 | 100 | 0.25 |
# sql
%%sql
SELECT *
FROM OrderDetail
WHERE Quantity >= 60
GROUP BY orderid,
quantity
HAVING count(*) > 1
ORDER BY orderid;
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10263/16 | 10263 | 16 | 13.9 | 60 | 0.25 |
1 | 10658/40 | 10658 | 40 | 18.4 | 70 | 0.05 |
2 | 10990/21 | 10990 | 21 | 10.0 | 65 | 0.00 |
3 | 11030/2 | 11030 | 2 | 19.0 | 100 | 0.25 |
spark.sql('select * from OrderDetail limit 1').show()
+--------+-------+---------+---------+--------+--------+ | Id|OrderId|ProductId|UnitPrice|Quantity|Discount| +--------+-------+---------+---------+--------+--------+ |10248/11| 10248| 11| 14.0| 12| 0.0| +--------+-------+---------+---------+--------+--------+
%%sql
SELECT OrderId,
Quantity -- CAVEAT: select * does not work.
FROM OrderDetail
WHERE Quantity >= 60
GROUP BY OrderId,
Quantity
HAVING count(*) > 1
ORDER BY OrderId;
OrderId | Quantity | |
---|---|---|
0 | 10263 | 60 |
1 | 10658 | 70 |
2 | 10990 | 65 |
3 | 11030 | 100 |
Based on the previous question, we now want to show details of the order, for orders that match the above criteria.
OrderDetail.head(2)
# we want all orders from orderid such that this orderid matches above criteria.
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
# Rows (by original index) that belong to (OrderId, Quantity) pairs with
# Quantity >= 60 appearing more than once in OrderDetail.
dfx = OrderDetail[OrderDetail.Quantity >= 60]\
.groupby(['OrderId','Quantity'])['Id']\
.filter(lambda dfx: len(dfx)>1)
# Inspect which row labels matched.
dfx.index
Int64Index([43, 45, 1080, 1082, 1916, 1918, 2017, 2020], dtype='int64')
OrderDetail.loc[f.index]['OrderId']
43 10263 45 10263 1080 10658 1082 10658 1916 10990 1918 10990 2017 11030 2020 11030 Name: OrderId, dtype: int64
# Every line item of every order that contains a double-entered
# (Quantity >= 60) pair — not just the duplicated rows themselves.
dup_rows = (OrderDetail[OrderDetail.Quantity >= 60]
            .groupby(['OrderId', 'Quantity'])['Id']
            .filter(lambda grp: len(grp) > 1))
bad_orders = OrderDetail.loc[dup_rows.index, 'OrderId']
OrderDetail[OrderDetail.OrderId.isin(bad_orders)].reset_index(drop=True)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10263/16 | 10263 | 16 | 13.90 | 60 | 0.25 |
1 | 10263/24 | 10263 | 24 | 3.60 | 28 | 0.00 |
2 | 10263/30 | 10263 | 30 | 20.70 | 60 | 0.25 |
3 | 10263/74 | 10263 | 74 | 8.00 | 36 | 0.25 |
4 | 10658/21 | 10658 | 21 | 10.00 | 60 | 0.00 |
5 | 10658/40 | 10658 | 40 | 18.40 | 70 | 0.05 |
6 | 10658/60 | 10658 | 60 | 34.00 | 55 | 0.05 |
7 | 10658/77 | 10658 | 77 | 13.00 | 70 | 0.05 |
8 | 10990/21 | 10990 | 21 | 10.00 | 65 | 0.00 |
9 | 10990/34 | 10990 | 34 | 14.00 | 60 | 0.15 |
10 | 10990/55 | 10990 | 55 | 24.00 | 65 | 0.15 |
11 | 10990/61 | 10990 | 61 | 28.50 | 66 | 0.15 |
12 | 11030/2 | 11030 | 2 | 19.00 | 100 | 0.25 |
13 | 11030/5 | 11030 | 5 | 21.35 | 70 | 0.00 |
14 | 11030/29 | 11030 | 29 | 123.79 | 60 | 0.25 |
15 | 11030/59 | 11030 | 59 | 55.00 | 100 | 0.25 |
%%sql
-- using a subquery (the IN (...) below); no CTE is actually used here
SELECT *
FROM OrderDetail
WHERE OrderID IN
(SELECT DISTINCT OrderID
FROM OrderDetail
WHERE Quantity >= 60
GROUP BY OrderID,
Quantity
HAVING count(*) > 1
ORDER BY OrderID)
ORDER BY Quantity;
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10263/24 | 10263 | 24 | 3.60 | 28 | 0.00 |
1 | 10263/74 | 10263 | 74 | 8.00 | 36 | 0.25 |
2 | 10658/60 | 10658 | 60 | 34.00 | 55 | 0.05 |
3 | 10263/16 | 10263 | 16 | 13.90 | 60 | 0.25 |
4 | 10263/30 | 10263 | 30 | 20.70 | 60 | 0.25 |
5 | 10658/21 | 10658 | 21 | 10.00 | 60 | 0.00 |
6 | 10990/34 | 10990 | 34 | 14.00 | 60 | 0.15 |
7 | 11030/29 | 11030 | 29 | 123.79 | 60 | 0.25 |
8 | 10990/21 | 10990 | 21 | 10.00 | 65 | 0.00 |
9 | 10990/55 | 10990 | 55 | 24.00 | 65 | 0.15 |
10 | 10990/61 | 10990 | 61 | 28.50 | 66 | 0.15 |
11 | 10658/40 | 10658 | 40 | 18.40 | 70 | 0.05 |
12 | 10658/77 | 10658 | 77 | 13.00 | 70 | 0.05 |
13 | 11030/5 | 11030 | 5 | 21.35 | 70 | 0.00 |
14 | 11030/2 | 11030 | 2 | 19.00 | 100 | 0.25 |
15 | 11030/59 | 11030 | 59 | 55.00 | 100 | 0.25 |
# spark.sql(q).show()
Details, derived table. Here's another way of getting the same results as in the previous problem, using a derived table instead of a CTE. However, there's a bug in this SQL: it returns 20 rows instead of 16. Correct the SQL. Problem SQL:
Select
OrderDetails.OrderID,
ProductID,
UnitPrice,
Quantity,
Discount
From OrderDetails
Join (
Select
OrderID
From OrderDetails
Where Quantity >= 60
Group By OrderID, Quantity
Having Count(*) > 1
) PotentialProblemOrders
on PotentialProblemOrders.OrderID = OrderDetails.OrderID
Order by OrderID, ProductID
%sql select * from OrderDetail limit 1;
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14 | 12 | 0.0 |
%%sql
SELECT od.OrderID,
ProductID,
UnitPrice,
Quantity,
Discount
FROM OrderDetail AS od
JOIN
(SELECT OrderID
FROM OrderDetail
WHERE Quantity >= 60
GROUP BY OrderID,
Quantity
HAVING Count(*) > 1) sub ON sub.OrderID = od.OrderID
ORDER BY od.OrderID,
od.ProductID;
OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|
0 | 10263 | 16 | 13.90 | 60 | 0.25 |
1 | 10263 | 24 | 3.60 | 28 | 0.00 |
2 | 10263 | 30 | 20.70 | 60 | 0.25 |
3 | 10263 | 74 | 8.00 | 36 | 0.25 |
4 | 10658 | 21 | 10.00 | 60 | 0.00 |
5 | 10658 | 40 | 18.40 | 70 | 0.05 |
6 | 10658 | 60 | 34.00 | 55 | 0.05 |
7 | 10658 | 77 | 13.00 | 70 | 0.05 |
8 | 10990 | 21 | 10.00 | 65 | 0.00 |
9 | 10990 | 34 | 14.00 | 60 | 0.15 |
10 | 10990 | 55 | 24.00 | 65 | 0.15 |
11 | 10990 | 61 | 28.50 | 66 | 0.15 |
12 | 11030 | 2 | 19.00 | 100 | 0.25 |
13 | 11030 | 5 | 21.35 | 70 | 0.00 |
14 | 11030 | 29 | 123.79 | 60 | 0.25 |
15 | 11030 | 59 | 55.00 | 100 | 0.25 |
# spark.sql(q).show()
Some customers are complaining about their orders arriving late. Which orders are late?
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
Orders[Orders.shippeddate > Orders.requireddate].head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16 | 10264 | FOLKO | 6 | 1996-07-24 | 1996-08-21 | 1996-08-23 | 3 | 3.67 | Folk och fä HB | Åkergatan 24 | Bräcke | None | S-844 67 | Sweden |
23 | 10271 | SPLIR | 6 | 1996-08-01 | 1996-08-29 | 1996-08-30 | 2 | 4.54 | Split Rail Beer & Ale | P.O. Box 555 | Lander | WY | 82520 | USA |
# pd.read_sql('pragma table_info("Orders")',conn)
# we can see the pragma of date related columns are already datetype
# we dont need to cast them as date.
%%sql
SELECT *
FROM Orders
WHERE shippeddate > requireddate
LIMIT 2;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10264 | FOLKO | 6 | 1996-07-24 | 1996-08-21 | 1996-08-23 | 3 | 3.67 | Folk och fä HB | Åkergatan 24 | Bräcke | None | S-844 67 | Sweden |
1 | 10271 | SPLIR | 6 | 1996-08-01 | 1996-08-29 | 1996-08-30 | 2 | 4.54 | Split Rail Beer & Ale | P.O. Box 555 | Lander | WY | 82520 | USA |
# spark.sql(q).show()
Some salespeople have more orders arriving late than others. Maybe they're not following up on the order process, and need more training. Which salespeople have the most orders arriving late?
Orders[Orders.shippeddate > Orders.requireddate].head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16 | 10264 | FOLKO | 6 | 1996-07-24 | 1996-08-21 | 1996-08-23 | 3 | 3.67 | Folk och fä HB | Åkergatan 24 | Bräcke | None | S-844 67 | Sweden |
Employee.head(1)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
# Late orders (shipped after the required date) per salesperson, highest
# first, joined with the employee's name.
late_per_emp = (Orders[Orders.shippeddate > Orders.requireddate]
                .groupby('employeeid')['orderid'].count()
                .rename('TotalLateOrders')
                .sort_values(ascending=False)
                .reset_index())
(late_per_emp
 .merge(Employee, left_on='employeeid', right_on='Id',
        suffixes=['_late', '_emp'])
 [['employeeid', 'TotalLateOrders', 'FirstName', 'LastName']])
employeeid | TotalLateOrders | FirstName | LastName | |
---|---|---|---|---|
0 | 4 | 10 | Margaret | Peacock |
1 | 3 | 5 | Janet | Leverling |
2 | 9 | 4 | Anne | Dodsworth |
3 | 8 | 4 | Laura | Callahan |
4 | 7 | 4 | Robert | King |
5 | 2 | 4 | Andrew | Fuller |
6 | 6 | 3 | Michael | Suyama |
7 | 1 | 2 | Nancy | Davolio |
8 | 5 | 1 | Steven | Buchanan |
%sql select * from Orders limit 1;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
%sql select * from Employee limit 1;
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2 | http://accweb/emmployees/davolio.bmp |
%%sql
SELECT e.Id,
e.LastName,
count(*) AS TotalLateOrders
FROM Employee e
JOIN Orders o ON o.employeeid = e.id
WHERE o.shippeddate > o.requireddate
GROUP BY o.employeeid,
e.LastName
ORDER BY count(*) DESC;
Id | LastName | TotalLateOrders | |
---|---|---|---|
0 | 4 | Peacock | 10 |
1 | 3 | Leverling | 5 |
2 | 2 | Fuller | 4 |
3 | 7 | King | 4 |
4 | 8 | Callahan | 4 |
5 | 9 | Dodsworth | 4 |
6 | 6 | Suyama | 3 |
7 | 1 | Davolio | 2 |
8 | 5 | Buchanan | 1 |
%%sql
-- # WARNING: cast as date gives WRONG result.
SELECT e.Id,
e.LastName,
count(*) AS TotalLateOrders
FROM Employee e
JOIN Orders o ON o.employeeid = e.id
WHERE cast(o.shippeddate AS date) > cast(o.requireddate AS date)
GROUP BY o.employeeid,
e.LastName
ORDER BY count(*) DESC;
Id | LastName | TotalLateOrders | |
---|---|---|---|
0 | 7 | King | 1 |
%%sql
SELECT shippeddate,
cast(o.shippeddate AS date)
FROM Orders o
LIMIT 2;
shippeddate | cast(o.shippeddate AS date) | |
---|---|---|
0 | 1996-07-16 | 1996 |
1 | 1996-07-10 | 1996 |
Andrew, the VP of sales, has been doing some more thinking about the problem of late orders. He realizes that just looking at the number of orders arriving late for each salesperson isn't a good idea. It needs to be compared against the total number of orders per salesperson. Return results like the following:
Expected Result
EmployeeID LastName AllOrders LateOrders
----------- -------------------- ----------- -----------
1 Davolio 123 3
2 Fuller 96 4
3 Leverling 127 5
4 Peacock 156 10
6 Suyama 67 3
7 King 72 4
8 Callahan 104 5
9 Dodsworth 43 5
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
# Count of strictly-late (shipped after required date) orders per employee.
Orders[Orders.shippeddate > Orders.requireddate]\
.groupby('employeeid')['orderid'].count()\
.rename('LateOrders').reset_index()
employeeid | LateOrders | |
---|---|---|
0 | 1 | 2 |
1 | 2 | 4 |
2 | 3 | 5 |
3 | 4 | 10 |
4 | 5 | 1 |
5 | 6 | 3 |
6 | 7 | 4 |
7 | 8 | 4 |
8 | 9 | 4 |
# Total orders per employee; value_counts() sorts by count, descending.
Orders.employeeid.value_counts().rename('AllOrders')\
.rename_axis('employeeid').reset_index()
employeeid | AllOrders | |
---|---|---|
0 | 4 | 156 |
1 | 3 | 127 |
2 | 1 | 123 |
3 | 8 | 104 |
4 | 2 | 96 |
5 | 7 | 72 |
6 | 6 | 67 |
7 | 9 | 43 |
8 | 5 | 42 |
# LateOrders joined with AllOrders and the employee's last name.
# NOTE(review): '>=' also counts orders shipped ON the required date as
# late; strictly "late" would be '>', but '>=' reproduces the book's
# expected numbers (see the Davolio discussion below).
Orders[Orders.shippeddate >= Orders.requireddate]\
.groupby('employeeid')['orderid'].count()\
.rename('LateOrders').reset_index()\
.merge(Orders.employeeid.value_counts().rename('AllOrders')
.rename_axis('employeeid').reset_index(),
on='employeeid',suffixes=['_ord','_emp']
)\
.merge(
Employee, left_on='employeeid', right_on='Id',suffixes=['_old','']
)[['employeeid','LastName','AllOrders','LateOrders']]
# note: use >= not >
employeeid | LastName | AllOrders | LateOrders | |
---|---|---|---|---|
0 | 1 | Davolio | 123 | 3 |
1 | 2 | Fuller | 96 | 4 |
2 | 3 | Leverling | 127 | 5 |
3 | 4 | Peacock | 156 | 10 |
4 | 5 | Buchanan | 42 | 1 |
5 | 6 | Suyama | 67 | 3 |
6 | 7 | King | 72 | 4 |
7 | 8 | Callahan | 104 | 5 |
8 | 9 | Dodsworth | 43 | 5 |
Employee.head(1)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
%sql select * from Employee limit 1;
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2 | http://accweb/emmployees/davolio.bmp |
%sql select * from Orders limit 1;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
%%sql
-- All orders vs late orders per employee: the outer join counts every
-- order; the `late` derived table pre-counts late ones per employee.
-- NOTE(review): the inner JOIN on `late` drops employees with zero late
-- orders; '>=' is used (not '>') to match the book's expected numbers.
SELECT e.Id,
e.LastName,
count(*) AS TotalOrders,
late.TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
(SELECT o.EmployeeId,
e.LastName,
count(*) AS TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.EmployeeID = e.ID
WHERE o.shippeddate >= o.requireddate
GROUP BY o.EmployeeId,
e.LastName
ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
e.LastName;
Id | LastName | TotalOrders | TotalLateOrders | |
---|---|---|---|---|
0 | 1 | Davolio | 123 | 3 |
1 | 2 | Fuller | 96 | 4 |
2 | 3 | Leverling | 127 | 5 |
3 | 4 | Peacock | 156 | 10 |
4 | 5 | Buchanan | 42 | 1 |
5 | 6 | Suyama | 67 | 3 |
6 | 7 | King | 72 | 4 |
7 | 8 | Callahan | 104 | 5 |
8 | 9 | Dodsworth | 43 | 5 |
# I am getting different result for Davolio with employee id 1
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
Orders[Orders.employeeid==1].shape
(123, 14)
Orders[Orders.employeeid==1].head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10 | 10258 | ERNSH | 1 | 1996-07-17 | 1996-08-14 | 1996-07-23 | 1 | 140.509995 | Ernst Handel | Kirchgasse 6 | Graz | None | 8010 | Austria |
22 | 10270 | WARTH | 1 | 1996-08-01 | 1996-08-29 | 1996-08-02 | 1 | 136.539993 | Wartian Herkku | Torikatu 38 | Oulu | None | 90110 | Finland |
Employee.head(1)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
Orders.query("employeeid==1 and shippeddate > requireddate")
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
461 | 10709 | GOURL | 1 | 1997-10-17 | 1997-11-14 | 1997-11-20 | 3 | 210.800003 | Gourmet Lanchonetes | Av. Brasil, 442 | Campinas | SP | 04876-786 | Brazil |
579 | 10827 | BONAP | 1 | 1998-01-12 | 1998-01-26 | 1998-02-06 | 2 | 63.540001 | Bon app' | 12, rue des Bouchers | Marseille | None | 13008 | France |
# I can see employeeid 1 is Daviolio and She has TWO late shipping.
# If I use shippeddate >= requireddate, then our results match.
%%sql
SELECT e.Id,
e.LastName,
count(*) AS TotalOrders,
late.TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
(SELECT o.EmployeeId,
e.LastName,
count(*) AS TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.EmployeeID = e.ID
WHERE o.shippeddate >= o.requireddate
GROUP BY o.EmployeeId,
e.LastName
ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
e.LastName,
late.TotalLateOrders;
Id | LastName | TotalOrders | TotalLateOrders | |
---|---|---|---|---|
0 | 1 | Davolio | 123 | 3 |
1 | 2 | Fuller | 96 | 4 |
2 | 3 | Leverling | 127 | 5 |
3 | 4 | Peacock | 156 | 10 |
4 | 5 | Buchanan | 42 | 1 |
5 | 6 | Suyama | 67 | 3 |
6 | 7 | King | 72 | 4 |
7 | 8 | Callahan | 104 | 5 |
8 | 9 | Dodsworth | 43 | 5 |
error = """
If I do not include late.TotalLateOrders in group by
I get following error:
expression 'late.`TotalLateOrders`'
is neither present in the group by,
nor is it an aggregate function.
Add to group by or wrap in first() (or first_value)
if you don't care which value you get.
"""
Continuing on the answer for the above query, let's fix the results for row 5 — Buchanan. He should have a 0 instead of a Null in LateOrders.
%sql select * from Employee limit 1;
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2 | http://accweb/emmployees/davolio.bmp |
%sql select * from Orders limit 1;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
%%sql
-- LEFT JOIN keeps employees that have no row in the `late` derived
-- table; COALESCE turns the resulting NULL late-count into 0.  It is
-- exactly equivalent to CASE WHEN x IS NULL THEN 0 ELSE x END, just the
-- idiomatic form.  The useless ORDER BY inside the derived table is
-- dropped (ordering a subquery has no effect on the outer result).
SELECT e.Id,
       e.LastName,
       count(*) AS TotalOrders,
       COALESCE(late.TotalLateOrders, 0) AS TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.EmployeeID = e.Id
LEFT JOIN
  (SELECT o.EmployeeID,
          e.LastName,
          count(*) AS TotalLateOrders
   FROM Employee AS e
   JOIN Orders AS o ON o.EmployeeID = e.Id
   WHERE o.ShippedDate >= o.RequiredDate
   GROUP BY o.EmployeeID,
            e.LastName) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
         e.LastName;
Id | LastName | TotalOrders | TotalLateOrders | |
---|---|---|---|---|
0 | 1 | Davolio | 123 | 3 |
1 | 2 | Fuller | 96 | 4 |
2 | 3 | Leverling | 127 | 5 |
3 | 4 | Peacock | 156 | 10 |
4 | 5 | Buchanan | 42 | 1 |
5 | 6 | Suyama | 67 | 3 |
6 | 7 | King | 72 | 4 |
7 | 8 | Callahan | 104 | 5 |
8 | 9 | Dodsworth | 43 | 5 |
Now we want to get the percentage of late orders over total orders.
Orders[Orders.shippeddate >= Orders.requireddate]\
.groupby('employeeid')['orderid'].count()\
.rename('LateOrders').reset_index()\
.merge(Orders.employeeid.value_counts().rename('AllOrders')
.rename_axis('employeeid').reset_index(),
on='employeeid',suffixes=['_ord','_emp']
)\
.merge(
Employee, left_on='employeeid', right_on='Id',suffixes=['_old','']
)[['employeeid','LastName','AllOrders','LateOrders']]\
.assign(
PercentLateOrders = lambda dfx: (dfx['LateOrders']/dfx['AllOrders']))
employeeid | LastName | AllOrders | LateOrders | PercentLateOrders | |
---|---|---|---|---|---|
0 | 1 | Davolio | 123 | 3 | 0.024390 |
1 | 2 | Fuller | 96 | 4 | 0.041667 |
2 | 3 | Leverling | 127 | 5 | 0.039370 |
3 | 4 | Peacock | 156 | 10 | 0.064103 |
4 | 5 | Buchanan | 42 | 1 | 0.023810 |
5 | 6 | Suyama | 67 | 3 | 0.044776 |
6 | 7 | King | 72 | 4 | 0.055556 |
7 | 8 | Callahan | 104 | 5 | 0.048077 |
8 | 9 | Dodsworth | 43 | 5 | 0.116279 |
%%sql
SELECT e.Id,
e.LastName,
count(*) AS TotalOrders, -- late.TotalLateOrders,
-- Handle the missing values
CASE
WHEN late.TotalLateOrders IS NULL THEN 0
ELSE late.TotalLateOrders
END AS TotalLateOrders, -- cast(late.TotalLateOrders as double) / cast(count(*) as double) as PercentLateOrders
-- Handle missing values
cast(CASE
WHEN late.TotalLateOrders IS NULL THEN 0
ELSE late.TotalLateOrders
END AS DOUBLE) / cast(count(*) AS DOUBLE) AS PercentLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
(SELECT o.EmployeeId,
e.LastName,
count(*) AS TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.EmployeeID = e.ID
WHERE o.shippeddate >= o.requireddate
GROUP BY o.EmployeeId,
e.LastName
ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
e.LastName;
Id | LastName | TotalOrders | TotalLateOrders | PercentLateOrders | |
---|---|---|---|---|---|
0 | 1 | Davolio | 123 | 3 | 0.024390 |
1 | 2 | Fuller | 96 | 4 | 0.041667 |
2 | 3 | Leverling | 127 | 5 | 0.039370 |
3 | 4 | Peacock | 156 | 10 | 0.064103 |
4 | 5 | Buchanan | 42 | 1 | 0.023810 |
5 | 6 | Suyama | 67 | 3 | 0.044776 |
6 | 7 | King | 72 | 4 | 0.055556 |
7 | 8 | Callahan | 104 | 5 | 0.048077 |
8 | 9 | Dodsworth | 43 | 5 | 0.116279 |
%%sql
SELECT e.Id,
e.LastName,
count(*) AS TotalOrders, -- late.TotalLateOrders,
-- Handle the missing values
CASE
WHEN late.TotalLateOrders IS NULL THEN 0
ELSE late.TotalLateOrders
END AS TotalLateOrders, -- cast(late.TotalLateOrders as double) / cast(count(*) as double) as PercentLateOrders
-- Handle missing values
cast(CASE
WHEN late.TotalLateOrders IS NULL THEN 0
ELSE late.TotalLateOrders
END AS DOUBLE) / cast(count(*) AS DOUBLE) AS PercentLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
(SELECT o.EmployeeId,
e.LastName,
count(*) AS TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.EmployeeID = e.ID
WHERE o.shippeddate >= o.requireddate
GROUP BY o.EmployeeId,
e.LastName
ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
e.LastName,
TotalLateOrders;
Id | LastName | TotalOrders | TotalLateOrders | PercentLateOrders | |
---|---|---|---|---|---|
0 | 1 | Davolio | 123 | 3 | 0.024390 |
1 | 2 | Fuller | 96 | 4 | 0.041667 |
2 | 3 | Leverling | 127 | 5 | 0.039370 |
3 | 4 | Peacock | 156 | 10 | 0.064103 |
4 | 5 | Buchanan | 42 | 1 | 0.023810 |
5 | 6 | Suyama | 67 | 3 | 0.044776 |
6 | 7 | King | 72 | 4 | 0.055556 |
7 | 8 | Callahan | 104 | 5 | 0.048077 |
8 | 9 | Dodsworth | 43 | 5 | 0.116279 |
So now for the PercentageLateOrders, we get a decimal value like we should. But to make the output easier to read, let's cut the PercentLateOrders off at 2 digits to the right of the decimal point.
%%sql
-- Final report: the two CASE-WHEN-NULL expressions are replaced by the
-- equivalent, idiomatic COALESCE, and the percentage is rounded to two
-- decimal places.  NOTE(review): the inner JOIN on `late` drops
-- employees with zero late orders, so the COALESCE only guards the
-- expression form, not missing rows (use LEFT JOIN if those matter).
SELECT e.Id,
       e.LastName,
       count(*) AS TotalOrders,
       COALESCE(late.TotalLateOrders, 0) AS TotalLateOrders,
       round(cast(COALESCE(late.TotalLateOrders, 0) AS DOUBLE)
             / cast(count(*) AS DOUBLE), 2) AS PercentLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
  (SELECT o.EmployeeId,
          e.LastName,
          count(*) AS TotalLateOrders
   FROM Employee AS e
   JOIN Orders AS o ON o.EmployeeID = e.ID
   WHERE o.shippeddate >= o.requireddate
   GROUP BY o.EmployeeId,
            e.LastName) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
         e.LastName,
         TotalLateOrders;
Id | LastName | TotalOrders | TotalLateOrders | PercentLateOrders | |
---|---|---|---|---|---|
0 | 1 | Davolio | 123 | 3 | 0.02 |
1 | 2 | Fuller | 96 | 4 | 0.04 |
2 | 3 | Leverling | 127 | 5 | 0.04 |
3 | 4 | Peacock | 156 | 10 | 0.06 |
4 | 5 | Buchanan | 42 | 1 | 0.02 |
5 | 6 | Suyama | 67 | 3 | 0.04 |
6 | 7 | King | 72 | 4 | 0.06 |
7 | 8 | Callahan | 104 | 5 | 0.05 |
8 | 9 | Dodsworth | 43 | 5 | 0.12 |
Andrew Fuller, the VP of sales at Northwind, would like to do a sales campaign for existing customers. He'd like to categorize customers into groups, based on how much they ordered in 1997. Then, depending on which group the customer is in, he will target the customer with different sales materials. The customer grouping categories are 0 to 1,000, 1,000 to 5,000, 5,000 to 10,000, and over 10,000. A good starting point for this query is the answer from the problem “High-value customers - total orders”. We don’t want to show customers who don’t have any orders in 1997. Order the results by CustomerID.
# OrderDetail Orders Employee
# Data only for 1997, at least one order
# make categories
# order by customerid
# CustomerID CompanyName TotalOrderAmount CustomerGroup
# Orders Customers OrderDetail** CreateThis
OrderDetail.head(1)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
Customer.head(1)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
( OrderDetail.drop(['Id','ProductId'],axis=1)
 # line total = unit price * quantity, reduced by the discount fraction
 .assign(OrderPrice = lambda x: x['UnitPrice'] * x['Quantity'] * (1-x['Discount']))
 # the raw pricing columns are no longer needed
 .drop(['UnitPrice', 'Quantity', 'Discount'],axis=1)
 # keep only orders placed in 1997
 .merge(
     Orders[['orderid','customerid','orderdate']
           ].loc[lambda x: x.orderdate.dt.year==1997],
     left_on='OrderId',right_on='orderid',suffixes=['_od','']
 )
 # drop the join keys and the date
 .drop(['OrderId','orderid','orderdate'],axis=1)
 # attach the company name
 .merge(
     Customer[['Id','CompanyName']],
     left_on='customerid',right_on='Id',suffixes=['_old','']
 )
 # drop duplicate column Id
 .drop(['Id'],axis=1)
 # total 1997 order amount per customer (broadcast back to every row)
 .assign(
     TotalOrderAmount = lambda dfx:
         dfx.groupby('customerid')['OrderPrice'].transform('sum')
 )
 # drop unwanted column
 # (axis must be a keyword argument: positional axis is deprecated and
 #  removed in pandas 2.0)
 .drop('OrderPrice', axis=1)
 # bucket customers by their 1997 total
 .assign(
     CustomerGroup = lambda dfx: pd.cut(dfx['TotalOrderAmount'],
                                        bins=[0,1000,5000, 10000, np.inf],
                                        labels=['Low','Medium','High','VeryHigh']
                                        )
 )
 # one row per customer
 .drop_duplicates('customerid')
 # order by customerid
 .sort_values('customerid')
 # reset the index for clean display
 .reset_index(drop=True)
).pipe(lambda x: [print(x.shape), display(x.head())])
(86, 4)
customerid | CompanyName | TotalOrderAmount | CustomerGroup | |
---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | 2022.5000 | Medium |
1 | ANATR | Ana Trujillo Emparedados y helados | 799.7500 | Low |
2 | ANTON | Antonio Moreno Taquería | 5960.7775 | High |
3 | AROUT | Around the Horn | 6406.9000 | High |
4 | BERGS | Berglunds snabbköp | 13849.0150 | VeryHigh |
[None, None]
%sql select * from Customer limit 1;
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
%%sql
SELECT *,
CASE
WHEN TotalOrder BETWEEN 0 AND 1000 THEN 'low'
WHEN TotalOrder BETWEEN 1000 AND 5000 THEN 'medium'
WHEN TotalOrder BETWEEN 5000 AND 10000 THEN 'high'
WHEN TotalOrder >= 10000 THEN 'very high'
END AS CustomerGroup
FROM
(SELECT o.customerid,
c.companyname,
sum(od.unitprice * od.quantity) AS TotalOrder
FROM Orders AS o
JOIN OrderDetail AS od ON o.orderid = od.orderid
JOIN Customer AS c ON o.customerid = c.id
WHERE substr(o.orderdate, 1, 4) = '1997'
GROUP BY o.customerid,
c.companyname
ORDER BY o.customerid,
c.companyname)
LIMIT 2;
customerid | companyname | TotalOrder | CustomerGroup | |
---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | 2294.00 | medium |
1 | ANATR | Ana Trujillo Emparedados y helados | 799.75 | low |
# spark.sql(q).show(3)
There's a bug with the answer for the previous question. The CustomerGroup value for one of the rows is null. Fix the SQL so that there are no nulls in the CustomerGroup field.
# in pandas, nulls are automatically ignored.
# the book says using BETWEEN works good for integers, but
# for floats we should use >= and <
%%sql
SELECT *,
CASE
WHEN TotalOrder >=0
AND TotalOrder<1000 THEN 'low'
WHEN TotalOrder >=1000
AND TotalOrder<5000 THEN 'medium'
WHEN TotalOrder >= 5000
AND TotalOrder<10000 THEN 'high'
WHEN TotalOrder >= 10000 THEN 'very high'
END AS CustomerGroup
FROM
(SELECT o.customerid,
c.companyname,
sum(od.unitprice * od.quantity) AS TotalOrder
FROM Orders AS o
JOIN OrderDetail AS od ON o.orderid = od.orderid
JOIN Customer AS c ON o.customerid = c.id
WHERE substr(o.orderdate, 1, 4) = '1997'
GROUP BY o.customerid,
c.companyname
ORDER BY o.customerid,
c.companyname)
LIMIT 2;
customerid | companyname | TotalOrder | CustomerGroup | |
---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | 2294.00 | medium |
1 | ANATR | Ana Trujillo Emparedados y helados | 799.75 | low |
Based on the above query, show all the defined CustomerGroups, and the percentage in each. Sort by the total in each group, in descending order.
( OrderDetail.drop(['Id','ProductId'],axis=1)
 # line total = unit price * quantity, reduced by the discount fraction
 .assign(OrderPrice = lambda x: x['UnitPrice'] * x['Quantity'] * (1-x['Discount']))
 # the raw pricing columns are no longer needed
 .drop(['UnitPrice', 'Quantity', 'Discount'],axis=1)
 # keep only orders placed in 1997
 .merge(
     Orders[['orderid','customerid','orderdate']
           ].loc[lambda x: x.orderdate.dt.year==1997],
     left_on='OrderId',right_on='orderid',suffixes=['_od','']
 )
 # drop the join keys and the date
 .drop(['OrderId','orderid','orderdate'],axis=1)
 # attach the company name
 .merge(
     Customer[['Id','CompanyName']],
     left_on='customerid',right_on='Id',suffixes=['_old','']
 )
 # drop duplicate column Id
 .drop(['Id'],axis=1)
 # total 1997 order amount per customer (broadcast back to every row)
 .assign(
     TotalOrderAmount = lambda dfx:
         dfx.groupby('customerid')['OrderPrice'].transform('sum')
 )
 # drop unwanted column
 # (axis must be a keyword argument: positional axis is deprecated and
 #  removed in pandas 2.0)
 .drop('OrderPrice', axis=1)
 # bucket customers by their 1997 total
 .assign(
     CustomerGroup = lambda dfx: pd.cut(dfx['TotalOrderAmount'],
                                        bins=[0,1000,5000, 10000, np.inf],
                                        labels=['Low','Medium','High','VeryHigh']
                                        )
 )
 # one row per customer
 .drop_duplicates('customerid')
 # order by customerid
 .sort_values('customerid')
 # reset the index for clean display
 .reset_index(drop=True)
 # count customers per group; a per-row lambda cannot see the group
 # total, so the percentage is computed in a separate step below
 .groupby('CustomerGroup').agg(
     TotalInGroup = ('TotalOrderAmount','size'),
 )
 # share of customers in each group
 .assign(
     PercentageInGroup = lambda dfx: dfx['TotalInGroup']
                                     /dfx['TotalInGroup'].sum())
 # largest groups first
 .sort_values('TotalInGroup',ascending=False)
)
TotalInGroup | PercentageInGroup | |
---|---|---|
CustomerGroup | ||
Medium | 30 | 0.348837 |
High | 23 | 0.267442 |
VeryHigh | 17 | 0.197674 |
Low | 16 | 0.186047 |
%%sql
SELECT o.customerid,
coalesce(sum(od.unitprice * od.quantity), 0) AS total
FROM Orders o
INNER JOIN OrderDetail od ON o.orderid = od.orderid
WHERE substr(o.orderdate, 1, 4) = '1997'
GROUP BY o.customerid
LIMIT 2;
customerid | total | |
---|---|---|
0 | ALFKI | 2294.00 |
1 | ANATR | 799.75 |
%%sql
SELECT *,
CASE
WHEN o7.total >=0
AND o7.total <1000 THEN 'low'
WHEN o7.total >=1000
AND o7.total <5000 THEN 'medium'
WHEN o7.total >= 5000
AND o7.total <10000 THEN 'high'
WHEN o7.total >= 10000 THEN 'very high'
END AS mygroup
FROM
(SELECT o.customerid,
coalesce(sum(od.unitprice * od.quantity), 0) AS total
FROM Orders o
INNER JOIN OrderDetail od ON o.orderid = od.orderid
WHERE substr(o.orderdate, 1, 4) = '1997'
GROUP BY o.customerid) AS o7
LIMIT 2;
customerid | total | mygroup | |
---|---|---|---|
0 | ALFKI | 2294.00 | medium |
1 | ANATR | 799.75 | low |
%%sql
SELECT gc.mygroup,
       gc.group_total,
       -- divide by the computed grand total instead of hard-coding 86.0;
       -- the window sum keeps one row per group (a plain aggregate would
       -- collapse the result to a single row)
       gc.group_total * 1.0 / sum(gc.group_total) OVER () AS percentage
FROM
  (SELECT cg.mygroup,
          count(cg.mygroup) AS group_total
   FROM
     (SELECT *,
             -- half-open ranges so boundary values land in exactly one bucket
             CASE
                 WHEN o7.total >= 0    AND o7.total < 1000  THEN 'low'
                 WHEN o7.total >= 1000 AND o7.total < 5000  THEN 'medium'
                 WHEN o7.total >= 5000 AND o7.total < 10000 THEN 'high'
                 WHEN o7.total >= 10000 THEN 'very high'
             END AS mygroup
      FROM
        (SELECT o.customerid,
                coalesce(sum(od.unitprice * od.quantity), 0) AS total
         FROM Orders o
         INNER JOIN OrderDetail od ON o.orderid = od.orderid
         WHERE substr(o.orderdate, 1, 4) = '1997'
         GROUP BY o.customerid) AS o7) AS cg
   GROUP BY cg.mygroup) AS gc
ORDER BY gc.group_total DESC;
mygroup | group_total | percentage | |
---|---|---|---|
0 | medium | 29 | 0.337209 |
1 | high | 23 | 0.267442 |
2 | very high | 18 | 0.209302 |
3 | low | 16 | 0.186047 |
# I don't have the extra table needed to solve this question.
Some Northwind employees are planning a business trip, and would like to visit as many suppliers and customers as possible. For their planning, they’d like to see a list of all countries where suppliers and/or customers are based.
Supplier.head(1)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | HomePage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Exotic Liquids | Charlotte Cooper | Purchasing Manager | 49 Gilbert St. | London | None | EC1 4SD | UK | (171) 555-2222 | None | None |
Customer.head(1)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
np.union1d(Supplier['Country'],
Customer['Country'])
array(['Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil', 'Canada', 'Denmark', 'Finland', 'France', 'Germany', 'Ireland', 'Italy', 'Japan', 'Mexico', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Singapore', 'Spain', 'Sweden', 'Switzerland', 'UK', 'USA', 'Venezuela'], dtype=object)
%%sql
SELECT country
FROM Supplier
UNION
SELECT country
FROM Customer;
Country | |
---|---|
0 | Argentina |
1 | Australia |
2 | Austria |
3 | Belgium |
4 | Brazil |
5 | Canada |
6 | Denmark |
7 | Finland |
8 | France |
9 | Germany |
10 | Ireland |
11 | Italy |
12 | Japan |
13 | Mexico |
14 | Netherlands |
15 | Norway |
16 | Poland |
17 | Portugal |
18 | Singapore |
19 | Spain |
20 | Sweden |
21 | Switzerland |
22 | UK |
23 | USA |
24 | Venezuela |
The employees going on the business trip don’t want just a raw list of countries, they want more details. We’d like to see output like the below, in the Expected Results:
SupplierCountry CustomerCountry
--------------- ---------------
NULL Argentina
Australia NULL
Supplier.head(1)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | HomePage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Exotic Liquids | Charlotte Cooper | Purchasing Manager | 49 Gilbert St. | London | None | EC1 4SD | UK | (171) 555-2222 | None | None |
Customer.head(1)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
# NOTE: an axis=1 concat aligns on the positional index, NOT on the country
# name, so the two columns do not line up by country (see the misaligned
# rows in the output below, e.g. Australia | France).  Kept only to show
# why this naive approach fails; the merge-based versions below are correct.
pd.concat([Supplier.Country.drop_duplicates(),
           Customer.Country.drop_duplicates()]
          ,ignore_index=True,axis=1)\
.rename(columns={0:'SuppliersCountry',1:'CustomersCountry'})\
.sort_values(['SuppliersCountry','CustomersCountry'])
SuppliersCountry | CustomersCountry | |
---|---|---|
6 | Australia | France |
9 | Brazil | Canada |
24 | Canada | NaN |
20 | Denmark | NaN |
22 | Finland | NaN |
17 | France | NaN |
10 | Germany | NaN |
13 | Italy | Switzerland |
3 | Japan | UK |
21 | Netherlands | NaN |
14 | Norway | Brazil |
19 | Singapore | Austria |
4 | Spain | Sweden |
8 | Sweden | NaN |
0 | UK | Germany |
1 | USA | Mexico |
11 | NaN | Argentina |
49 | NaN | Belgium |
72 | NaN | Denmark |
86 | NaN | Finland |
36 | NaN | Ireland |
26 | NaN | Italy |
69 | NaN | Norway |
90 | NaN | Poland |
27 | NaN | Portugal |
7 | NaN | Spain |
31 | NaN | USA |
32 | NaN | Venezuela |
(Supplier[['Country']]
.drop_duplicates()
.set_index('Country',drop=False)
.sort_index()
.rename(columns={'Country':'SuppliersCountry'})
.rename_axis('id')
.reset_index()
).head(2)
id | SuppliersCountry | |
---|---|---|
0 | Australia | Australia |
1 | Brazil | Brazil |
# Build two one-column frames keyed by country ('id') and outer-merge them,
# so every country appears once with the supplier side, the customer side,
# or both filled in — a pandas stand-in for SQL's FULL OUTER JOIN.
(pd.merge(
    # suppliers: unique countries, duplicated into an 'id' join key
    (Supplier[['Country']]
    .drop_duplicates()
    .set_index('Country',drop=False)
    .sort_index()
    .rename(columns={'Country':'SuppliersCountry'})
    .rename_axis('id')
    .reset_index()
    ),
    # customers: same construction for the customer side
    (Customer[['Country']]
    .drop_duplicates()
    .set_index('Country',drop=False)
    .sort_index()
    .rename(columns={'Country':'CustomersCountry'})
    .rename_axis('id')
    .reset_index()
    ),
    # outer merge keeps countries present on either side
    how='outer')
 # alphabetical by country, then hide the join key itself
 .sort_values('id').drop('id', axis=1)
)
SuppliersCountry | CustomersCountry | |
---|---|---|
16 | NaN | Argentina |
0 | Australia | NaN |
17 | NaN | Austria |
18 | NaN | Belgium |
1 | Brazil | Brazil |
2 | Canada | Canada |
3 | Denmark | Denmark |
4 | Finland | Finland |
5 | France | France |
6 | Germany | Germany |
19 | NaN | Ireland |
7 | Italy | Italy |
8 | Japan | NaN |
20 | NaN | Mexico |
9 | Netherlands | NaN |
10 | Norway | Norway |
21 | NaN | Poland |
22 | NaN | Portugal |
11 | Singapore | NaN |
12 | Spain | Spain |
13 | Sweden | Sweden |
23 | NaN | Switzerland |
14 | UK | UK |
15 | USA | USA |
24 | NaN | Venezuela |
# Unique countries on each side, duplicated into a shared 'id' join key.
supplier_countries = Supplier.Country.drop_duplicates()
customer_countries = Customer.Country.drop_duplicates()
tmp1 = pd.DataFrame({'id': supplier_countries,
                     'supplier_country': supplier_countries})
tmp2 = pd.DataFrame({'id': customer_countries,
                     'customer_country': customer_countries})
# Full outer join on the country key, then sort and hide the key itself.
result = (pd.merge(tmp1, tmp2, how='outer')
          .sort_values('id')
          .drop('id', axis=1))
result
supplier_country | customer_country | |
---|---|---|
17 | NaN | Argentina |
4 | Australia | NaN |
19 | NaN | Austria |
23 | NaN | Belgium |
6 | Brazil | Brazil |
15 | Canada | Canada |
12 | Denmark | Denmark |
14 | Finland | Finland |
10 | France | France |
7 | Germany | Germany |
22 | NaN | Ireland |
8 | Italy | Italy |
2 | Japan | NaN |
16 | NaN | Mexico |
13 | Netherlands | NaN |
9 | Norway | Norway |
24 | NaN | Poland |
20 | NaN | Portugal |
11 | Singapore | NaN |
3 | Spain | Spain |
5 | Sweden | Sweden |
18 | NaN | Switzerland |
0 | UK | UK |
1 | USA | USA |
21 | NaN | Venezuela |
%%sql
SELECT sp.country AS suplier_country,
cp.country AS customer_country
FROM
(SELECT country
FROM supplier
UNION SELECT country
FROM customer) AS c
LEFT JOIN
(SELECT DISTINCT country
FROM supplier) AS sp ON c.country = sp.country
LEFT JOIN
(SELECT DISTINCT country
FROM customer) AS cp ON c.country = cp.country;
suplier_country | customer_country | |
---|---|---|
0 | None | Argentina |
1 | Australia | None |
2 | None | Austria |
3 | None | Belgium |
4 | Brazil | Brazil |
5 | Canada | Canada |
6 | Denmark | Denmark |
7 | Finland | Finland |
8 | France | France |
9 | Germany | Germany |
10 | None | Ireland |
11 | Italy | Italy |
12 | Japan | None |
13 | None | Mexico |
14 | Netherlands | None |
15 | Norway | Norway |
16 | None | Poland |
17 | None | Portugal |
18 | Singapore | None |
19 | Spain | Spain |
20 | Sweden | Sweden |
21 | None | Switzerland |
22 | UK | UK |
23 | USA | USA |
24 | None | Venezuela |
# spark.sql(q).show() # spark does not respect order of columns
q_ms_sql = """
with countries AS (
SELECT
country
FROM
suppliers
UNION
SELECT
country
FROM
customers
), suppliercountry AS (
SELECT
distinct country
FROM
suppliers
), customercountry AS (
SELECT
distinct country
FROM
customers
)
SELECT
sp.country as suplier_country,
cp.country as customer_country
FROM
countries c
LEFT JOIN
suppliercountry sp on c.country = sp.country
LEFT JOIN
customercountry cp on c.country = cp.country
""";
q_book = """
;With SupplierCountries as
(Select Distinct Country from Suppliers)
,CustomerCountries as
(Select Distinct Country from Customers)
Select
SupplierCountry = SupplierCountries .Country
,CustomerCountry = CustomerCountries .Country
From SupplierCountries
Full Outer Join CustomerCountries
on CustomerCountries.Country = SupplierCountries.Country
""";
q_book_good = """
Select
SupplierCountry = SupplierCountries.Country
,CustomerCountry = CustomerCountries.Country
From
(
Select Distinct Country from Suppliers
) as SupplierCountries
Full Outer Join
(
Select Distinct Country from Customers
) as CustomerCountries
on CustomerCountries.Country = SupplierCountries.Country
""";
The output of the above is improved, but it’s still not ideal. What we’d really like to see is the country name, the total suppliers, and the total customers.
Supplier.head(1)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | HomePage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Exotic Liquids | Charlotte Cooper | Purchasing Manager | 49 Gilbert St. | London | None | EC1 4SD | UK | (171) 555-2222 | None | None |
Customer.head(1)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
t1 = Supplier['Country'].value_counts().rename_axis('Country').rename('TotalSuppliers').reset_index()
t1.head(2)
Country | TotalSuppliers | |
---|---|---|
0 | USA | 4 |
1 | Germany | 3 |
t2 = Customer['Country'].value_counts().rename_axis('Country').rename('TotalCustomers').reset_index()
t2.head(2)
Country | TotalCustomers | |
---|---|---|
0 | USA | 13 |
1 | France | 11 |
# Outer-join the per-country supplier/customer counts; countries missing
# from one side get a 0, and the counts are reported as integers.
combined = pd.merge(t1, t2, how='outer')
combined.sort_values('Country').fillna(0).set_index('Country').astype(int)
TotalSuppliers | TotalCustomers | |
---|---|---|
Country | ||
Argentina | 0 | 3 |
Australia | 2 | 0 |
Austria | 0 | 2 |
Belgium | 0 | 2 |
Brazil | 1 | 9 |
Canada | 2 | 3 |
Denmark | 1 | 2 |
Finland | 1 | 2 |
France | 3 | 11 |
Germany | 3 | 11 |
Ireland | 0 | 1 |
Italy | 2 | 3 |
Japan | 2 | 0 |
Mexico | 0 | 5 |
Netherlands | 1 | 0 |
Norway | 1 | 1 |
Poland | 0 | 1 |
Portugal | 0 | 2 |
Singapore | 1 | 0 |
Spain | 1 | 5 |
Sweden | 2 | 2 |
Switzerland | 0 | 2 |
UK | 2 | 7 |
USA | 4 | 13 |
Venezuela | 0 | 4 |
# NOTE(review): the original mixed table names (`suppliers` in the CTE but
# `supplier`/`customer` in the joins) and counted raw joined rows, which the
# double LEFT JOIN fans out to suppliers x customers per country.  Counting
# DISTINCT ids gives the true per-country totals, matching the pandas answer.
q_ms_sql = """
with countries AS (
    SELECT country FROM suppliers
    UNION
    SELECT country FROM customers
)
SELECT
    c.country,
    count(DISTINCT sp.id) as totalsuppliers,
    count(DISTINCT cp.id) as totalcustomers
FROM
    countries c
LEFT JOIN
    suppliers sp ON c.country = sp.country
LEFT JOIN
    customers cp ON c.country = cp.country
GROUP BY
    c.country
""";
%%sql
-- The two LEFT JOINs fan out to (suppliers x customers) rows per country,
-- so plain count(*) multiplies the totals (e.g. Brazil showed 9|9 instead
-- of 1|9).  Counting DISTINCT ids removes the fan-out.
SELECT c.country,
       count(DISTINCT sp.id) AS totalsuppliers,
       count(DISTINCT cp.id) AS totalcustomers
FROM
  (SELECT country
   FROM supplier
   UNION SELECT country
   FROM customer) AS c
LEFT JOIN supplier sp ON c.country = sp.country
LEFT JOIN customer cp ON c.country = cp.country
GROUP BY c.country;
country | totalsupliers | totalcustomers | |
---|---|---|---|
0 | Argentina | 0 | 3 |
1 | Australia | 2 | 0 |
2 | Austria | 0 | 2 |
3 | Belgium | 0 | 2 |
4 | Brazil | 9 | 9 |
5 | Canada | 6 | 6 |
6 | Denmark | 2 | 2 |
7 | Finland | 2 | 2 |
8 | France | 33 | 33 |
9 | Germany | 33 | 33 |
10 | Ireland | 0 | 1 |
11 | Italy | 6 | 6 |
12 | Japan | 2 | 0 |
13 | Mexico | 0 | 5 |
14 | Netherlands | 1 | 0 |
15 | Norway | 1 | 1 |
16 | Poland | 0 | 1 |
17 | Portugal | 0 | 2 |
18 | Singapore | 1 | 0 |
19 | Spain | 5 | 5 |
20 | Sweden | 4 | 4 |
21 | Switzerland | 0 | 2 |
22 | UK | 14 | 14 |
23 | USA | 52 | 52 |
24 | Venezuela | 0 | 4 |
q_book = """
;With SupplierCountries as
(Select Country , Total = Count(*) from Suppliers group by Country)
,CustomerCountries as
(Select Country , Total = Count(*) from Customers group by Country)
Select
Country = isnull( SupplierCountries.Country,
CustomerCountries.Country)
,TotalSuppliers= isnull(SupplierCountries.Total,0)
,TotalCustomers= isnull(CustomerCountries.Total,0)
From SupplierCountries
Full Outer Join CustomerCountries
on CustomerCountries.Country = SupplierCountries.Country
""";
q_book_good = """
Select
ifnull(SupplierCountries.Country,
CustomerCountries.Country) as Country
,ifnull(SupplierCountries.Total,0) as TotalSuppliers
,ifnull(CustomerCountries.Total,0) as TotalCustomers
From
(
Select Country , Total = Count(*) from Suppliers group by Country
) as SupplierCountries
Full Outer Join
( Select Country , Total = Count(*) from Customers group by Country
) as CustomerCountries
on CustomerCountries.Country = SupplierCountries.Country
""";
# pd.read_sql(q_book_good,conn)
# OperationalError: RIGHT and FULL OUTER JOINs are not currently supported
# this might pass in actual sqlite3 but fails in python module.
Looking at the Orders table—we’d like to show details for each order that was the first in that particular country, ordered by OrderID. So, we need one row per ShipCountry, and CustomerID, OrderID, and OrderDate should be of the first order from that country.
Expected Results
ShipCountry CustomerID OrderID OrderDate
--------------- ---------- ----------- ----------
Argentina OCEAN 10409 2015-01-09
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
# for a given country, find the date of its first order.
# 'min' is the earliest date regardless of row order; 'first' would just
# take whichever row happens to come first in the frame.
Orders.groupby('shipcountry').agg(
    first_orderdate=('orderdate','min')).head(2)
first_orderdate | |
---|---|
shipcountry | |
Argentina | 1997-01-09 |
Austria | 1996-07-17 |
# for a given day, there can be multiple orders.
(Orders[['shipcountry','customerid', 'orderid', 'orderdate']]
 .assign(
     # earliest order date per country, broadcast to every row;
     # 'min' does not rely on the rows being pre-sorted (unlike 'first')
     first_orderdate =
         lambda dfx: dfx.groupby('shipcountry'
                                 )['orderdate'].transform('min')
 )
 # keep only the orders placed on that first date
 .loc[lambda dfx: dfx.orderdate==dfx.first_orderdate]
 # present one country at a time, lowest orderid first
 .sort_values(['shipcountry','orderid'])
)
shipcountry | customerid | orderid | orderdate | first_orderdate | |
---|---|---|---|---|---|
161 | Argentina | OCEAN | 10409 | 1997-01-09 | 1997-01-09 |
10 | Austria | ERNSH | 10258 | 1996-07-17 | 1996-07-17 |
4 | Belgium | SUPRD | 10252 | 1996-07-09 | 1996-07-09 |
2 | Brazil | HANAR | 10250 | 1996-07-08 | 1996-07-08 |
84 | Canada | MEREP | 10332 | 1996-10-17 | 1996-10-17 |
93 | Denmark | SIMOB | 10341 | 1996-10-29 | 1996-10-29 |
18 | Finland | WARTH | 10266 | 1996-07-26 | 1996-07-26 |
0 | France | VINET | 10248 | 1996-07-04 | 1996-07-04 |
1 | Germany | TOMSP | 10249 | 1996-07-05 | 1996-07-05 |
50 | Ireland | HUNGO | 10298 | 1996-09-05 | 1996-09-05 |
27 | Italy | MAGAA | 10275 | 1996-08-07 | 1996-08-07 |
11 | Mexico | CENTC | 10259 | 1996-07-18 | 1996-07-18 |
139 | Norway | SANTG | 10387 | 1996-12-18 | 1996-12-18 |
126 | Poland | WOLZA | 10374 | 1996-12-05 | 1996-12-05 |
80 | Portugal | FURIB | 10328 | 1996-10-14 | 1996-10-14 |
33 | Spain | ROMEY | 10281 | 1996-08-14 | 1996-08-14 |
16 | Sweden | FOLKO | 10264 | 1996-07-24 | 1996-07-24 |
6 | Switzerland | CHOPS | 10254 | 1996-07-11 | 1996-07-11 |
41 | UK | BSBEV | 10289 | 1996-08-26 | 1996-08-26 |
14 | USA | RATTC | 10262 | 1996-07-22 | 1996-07-22 |
9 | Venezuela | HILAA | 10257 | 1996-07-16 | 1996-07-16 |
q_ms_sql = """
with ranked_orders AS (
SELECT
shipcountry,
customerid,
orderid,
orderdate,
rank() OVER (PARTITION BY shipcountry ORDER BY orderdate) as ranking
FROM
orders
)
SELECT
shipcountry,
customerid,
orderid,
orderdate
FROM
ranked_orders
WHERE
ranking = 1
ORDER BY
shipcountry
""";
%%sql
SELECT shipcountry,
       customerid,
       orderid,
       orderdate
FROM
  (SELECT shipcountry,
          customerid,
          orderid,
          orderdate,
          -- row_number (not rank) guarantees exactly one row per country
          -- even when two earliest orders share the same orderdate;
          -- orderid breaks ties deterministically
          row_number() OVER (PARTITION BY shipcountry
                             ORDER BY orderdate, orderid) AS ranking
   FROM orders) AS ranked_orders
WHERE ranking = 1
ORDER BY shipcountry;
shipcountry | customerid | orderid | orderdate | |
---|---|---|---|---|
0 | Argentina | OCEAN | 10409 | 1997-01-09 |
1 | Austria | ERNSH | 10258 | 1996-07-17 |
2 | Belgium | SUPRD | 10252 | 1996-07-09 |
3 | Brazil | HANAR | 10250 | 1996-07-08 |
4 | Canada | MEREP | 10332 | 1996-10-17 |
5 | Denmark | SIMOB | 10341 | 1996-10-29 |
6 | Finland | WARTH | 10266 | 1996-07-26 |
7 | France | VINET | 10248 | 1996-07-04 |
8 | Germany | TOMSP | 10249 | 1996-07-05 |
9 | Ireland | HUNGO | 10298 | 1996-09-05 |
10 | Italy | MAGAA | 10275 | 1996-08-07 |
11 | Mexico | CENTC | 10259 | 1996-07-18 |
12 | Norway | SANTG | 10387 | 1996-12-18 |
13 | Poland | WOLZA | 10374 | 1996-12-05 |
14 | Portugal | FURIB | 10328 | 1996-10-14 |
15 | Spain | ROMEY | 10281 | 1996-08-14 |
16 | Sweden | FOLKO | 10264 | 1996-07-24 |
17 | Switzerland | CHOPS | 10254 | 1996-07-11 |
18 | UK | BSBEV | 10289 | 1996-08-26 |
19 | USA | RATTC | 10262 | 1996-07-22 |
20 | Venezuela | HILAA | 10257 | 1996-07-16 |
q_book = """
;with OrdersByCountry as
(
Select
ShipCountry
,CustomerID
,OrderID
,OrderDate = convert(date, OrderDate)
,RowNumberPerCountry =
Row_Number()
over (Partition by ShipCountry Order by ShipCountry, OrderID)
From Orders
)
Select
ShipCountry
,CustomerID
,OrderID
,OrderDate
From OrdersByCountry
Where
RowNumberPerCountry = 1
Order by
ShipCountry
""";
q_book_good = """
Select
ShipCountry
,CustomerID
,OrderID
,OrderDate
From
(
Select
ShipCountry
,CustomerID
,OrderID
,OrderDate
,(Row_Number() over (Partition by ShipCountry
Order by ShipCountry, OrderID))
as RowNumberPerCountry
From Orders
) as OrdersByCountry
Where
RowNumberPerCountry = 1
Order by
ShipCountry
""";
There are some customers for whom freight is a major expense when ordering from Northwind. However, by batching up their orders, and making one larger order instead of multiple smaller orders in a short period of time, they could reduce their freight costs significantly. Show those customers who have made more than 1 order in a 5 day period. The sales people will use this to help customers reduce their costs. Note: There are more than one way of solving this kind of problem. For this problem, we will not be using Window functions.
Expected Result
CustomerID InitialOrderID InitialOrderDate nextid nextDate
NextOrderID NextOrderDate DaysBetween
cid iordid idate nxtid nxtdate daysBetwn
------------------------------------------------
ANTON 10677 2015-09-22 10682 2015-09-25 3
AROUT 10741 2015-11-14 10743 2015-11-17 3
BERGS 10278 2014-08-12 10280 2014-08-14 2
BERGS 10444 2015-02-12 10445 2015-02-13 1
BERGS 10866 2016-02-03 10875 2016-02-06 3
Orders.head(1)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
(Orders
# first sort
.sort_values(['customerid', 'orderdate'])
# new column
.assign(
daysbetween = lambda dfx: dfx['orderdate'].diff().dt.days
)
# join to itself which is shifted by -1
# NOTE: Here old Orders do not have column daysbetween
.join(Orders.groupby('customerid').shift(-1),
lsuffix='_initial',
rsuffix='_next')
# after joining drop unwanted cols
# .drop('daysbetween_initial', axis=1)
# filter
# .query('daysbetween_next <= 5 and daysbetween_next >=0')
).head(1)
orderid_initial | customerid | employeeid_initial | orderdate_initial | requireddate_initial | shippeddate_initial | shipvia_initial | freight_initial | shipname_initial | shipaddress_initial | ... | requireddate_next | shippeddate_next | shipvia_next | freight_next | shipname_next | shipaddress_next | shipcity_next | shipregion_next | shippostalcode_next | shipcountry_next | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
395 | 10643 | ALFKI | 6 | 1997-08-25 | 1997-09-22 | 1997-09-02 | 1 | 29.459999 | Alfreds Futterkiste | Obere Str. 57 | ... | 1997-10-31 | 1997-10-13 | 2.0 | 61.020001 | Alfred's Futterkiste | Obere Str. 57 | Berlin | None | 12209 | Germany |
1 rows × 28 columns
(Orders
 # sort so each customer's orders are consecutive and in date order
 .sort_values(['customerid', 'orderdate'])
 # gap in days since the previous row.  At a customer boundary this
 # differences across customers, but those bogus values are harmless:
 # the per-customer shift(-1) below never picks a boundary row's value
 # up as anyone's "next" gap.
 .assign(
    daysbetween = lambda dfx: dfx['orderdate'].diff().dt.days
 )
 # join each row to the same customer's NEXT order (shift -1 per group)
 # NOTE: the pipe is needed so the shifted copy also carries daysbetween
 .pipe(lambda dfx:
       dfx.join(dfx.groupby('customerid').shift(-1),
                lsuffix='_initial',rsuffix='_next')
  )
 # the row's own backward gap is not needed; keep only the forward gap
 .drop('daysbetween_initial', axis=1)
 # orders followed by another order from the same customer within 5 days
 # .query('daysbetween_next <= 5 and daysbetween_next >=0')
 .query(' 0<= daysbetween_next <= 5 ')
).pipe(lambda x: [x.shape, display(x.head())])
orderid_initial | customerid | employeeid_initial | orderdate_initial | requireddate_initial | shippeddate_initial | shipvia_initial | freight_initial | shipname_initial | shipaddress_initial | ... | shippeddate_next | shipvia_next | freight_next | shipname_next | shipaddress_next | shipcity_next | shipregion_next | shippostalcode_next | shipcountry_next | daysbetween_next | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
429 | 10677 | ANTON | 1 | 1997-09-22 | 1997-10-20 | 1997-09-26 | 3 | 4.030000 | Antonio Moreno Taquería | Mataderos 2312 | ... | 1997-10-01 | 2.0 | 36.130001 | Antonio Moreno Taquería | Mataderos 2312 | México D.F. | None | 05023 | Mexico | 3.0 |
493 | 10741 | AROUT | 4 | 1997-11-14 | 1997-11-28 | 1997-11-18 | 3 | 10.960000 | Around the Horn | Brook Farm Stratford St. Mary | ... | 1997-11-21 | 2.0 | 23.719999 | Around the Horn | Brook Farm Stratford St. Mary | Colchester | Essex | CO7 6JX | UK | 3.0 |
30 | 10278 | BERGS | 8 | 1996-08-12 | 1996-09-09 | 1996-08-16 | 2 | 92.690002 | Berglunds snabbköp | Berguvsvägen 8 | ... | 1996-09-12 | 1.0 | 8.980000 | Berglunds snabbköp | Berguvsvägen 8 | Luleå | None | S-958 22 | Sweden | 2.0 |
196 | 10444 | BERGS | 3 | 1997-02-12 | 1997-03-12 | 1997-02-21 | 3 | 3.500000 | Berglunds snabbköp | Berguvsvägen 8 | ... | 1997-02-20 | 1.0 | 9.300000 | Berglunds snabbköp | Berguvsvägen 8 | Luleå | None | S-958 22 | Sweden | 1.0 |
618 | 10866 | BERGS | 5 | 1998-02-03 | 1998-03-03 | 1998-02-12 | 1 | 109.110001 | Berglunds snabbköp | Berguvsvägen 8 | ... | 1998-03-03 | 2.0 | 32.369999 | Berglunds snabbköp | Berguvsvägen 8 | Luleå | None | S-958 22 | Sweden | 3.0 |
5 rows × 28 columns
[(69, 28), None]
# first sort so each customer's orders are consecutive and in date order
Orders = Orders.sort_values(['customerid', 'orderdate'])
# days since the previous order OF THE SAME CUSTOMER.
# A plain .diff() would also difference across customer boundaries and
# leave bogus gap values on each customer's first row — and this cell
# mutates the global Orders frame, so those values would persist.
Orders['daysbetween'] = Orders.groupby('customerid')['orderdate'].diff().dt.days
# join each row to the same customer's next order (shift -1 per group)
ans = Orders.join(
    Orders.groupby('customerid').shift(-1),
    lsuffix='_initial',rsuffix='_next')
# the row's own backward gap is not needed; keep only the forward gap
ans = ans.drop('daysbetween_initial', axis=1)
# orders followed by another order from the same customer within 5 days
ans.query('daysbetween_next <= 5 and daysbetween_next >=0').tail(2)
orderid_initial | customerid | employeeid_initial | orderdate_initial | requireddate_initial | shippeddate_initial | shipvia_initial | freight_initial | shipname_initial | shipaddress_initial | ... | shippeddate_next | shipvia_next | freight_next | shipname_next | shipaddress_next | shipcity_next | shipregion_next | shippostalcode_next | shipcountry_next | daysbetween_next | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
445 | 10693 | WHITC | 3 | 1997-10-06 | 1997-10-20 | 1997-10-10 | 3 | 139.339996 | White Clover Markets | 1029 - 12th Ave. S. | ... | 1997-10-14 | 3.0 | 102.550003 | White Clover Markets | 1029 - 12th Ave. S. | Seattle | WA | 98124 | USA | 2.0 |
625 | 10873 | WILMK | 4 | 1998-02-06 | 1998-03-06 | 1998-02-09 | 1 | 0.820000 | Wilman Kala | Keskuskatu 45 | ... | 1998-02-12 | 3.0 | 8.500000 | Wilman Kala | Keskuskatu 45 | Helsinki | None | 21240 | Finland | 4.0 |
2 rows × 28 columns
%%sql
SELECT initial_order.customerid,
initial_order.orderid AS initial_order_id,
initial_order.orderdate AS initial_order_date,
next_order.orderid AS nextorderid,
next_order.orderdate AS nextorderdate,
CAST ((JulianDay(next_order.orderdate)
- JulianDay(initial_order.orderdate)) AS int) AS daysbetween
FROM orders initial_order
INNER JOIN orders next_order ON initial_order.customerid = next_order.customerid
WHERE initial_order.orderid < next_order.orderid
AND (JulianDay(next_order.orderdate)
- JulianDay(initial_order.orderdate)) <= 5
LIMIT 2;
customerid | initial_order_id | initial_order_date | nextorderid | nextorderdate | daysbetween | |
---|---|---|---|---|---|---|
0 | HANAR | 10250 | 1996-07-08 | 10253 | 1996-07-10 | 2 |
1 | BERGS | 10278 | 1996-08-12 | 10280 | 1996-08-14 | 2 |
from datetime import datetime

# Sanity check: the gap between the two sample order dates is 5 days,
# matching the query's "within 5 days" threshold.
gap = datetime(1996, 10, 1) - datetime(1996, 9, 26)
gap
datetime.timedelta(days=5)
# customer ISLAT has bought at least twice within 5 days of
# 1996-09-26 to 1996-10-01
# Book's reference answer, written for MS SQL Server 2016
# (convert(date, ...) and datediff(dd, ...) are T-SQL only).
# Stored as a string for comparison; it is never executed against the
# SQLite database used in this notebook.
q_ms_sql = """
Select
InitialOrder.CustomerID
,InitialOrderID = InitialOrder.OrderID
,InitialOrderDate = convert(date, InitialOrder.OrderDate)
,NextOrderID = NextOrder.OrderID
,NextOrderDate = convert(date, NextOrder.OrderDate)
,DaysBetween = datediff(dd, InitialOrder.OrderDate,
NextOrder.OrderDate)
from Orders InitialOrder
join Orders NextOrder
on InitialOrder.CustomerID = NextOrder.CustomerID
where
InitialOrder.OrderID < NextOrder.OrderID
and datediff(dd, InitialOrder.OrderDate, NextOrder.OrderDate) <= 5
Order by
InitialOrder.CustomerID
,InitialOrder.OrderID
""";
%%sql
-- SQLite translation of the T-SQL reference answer above; identical to the
-- earlier self-join query, repeated here for side-by-side comparison.
SELECT initial_order.customerid,
initial_order.orderid AS initial_order_id,
initial_order.orderdate AS initial_order_date,
next_order.orderid AS nextorderid,
next_order.orderdate AS nextorderdate,
-- JulianDay difference replaces T-SQL's datediff(dd, ...).
CAST ((JulianDay(next_order.orderdate)
- JulianDay(initial_order.orderdate)) AS int) AS daysbetween
FROM orders initial_order
INNER JOIN orders next_order ON initial_order.customerid = next_order.customerid
WHERE initial_order.orderid < next_order.orderid
AND (JulianDay(next_order.orderdate) - JulianDay(initial_order.orderdate)) <= 5
LIMIT 2;
customerid | initial_order_id | initial_order_date | nextorderid | nextorderdate | daysbetween | |
---|---|---|---|---|---|---|
0 | HANAR | 10250 | 1996-07-08 | 10253 | 1996-07-10 | 2 |
1 | BERGS | 10278 | 1996-08-12 | 10280 | 1996-08-14 | 2 |
There’s another way of solving the problem above: using window functions. We would like to see the following results.
# Book's window-function variant of the reference answer: LEAD() pulls each
# customer's next order date, so no self-join is needed. Stored as a string
# for reference only -- the bare date subtraction here presumably targets
# the book's server dialect and would not work as-is in SQLite (see the
# JulianDay-based translation below).
q_ms_sql = """
WITH customer_orders AS (
SELECT
customerid,
orderid as initial_order_id,
orderdate as initial_order_date,
LEAD (o.orderdate, 1) OVER ( PARTITION BY customerid
ORDER BY orderdate) AS next_order_date
FROM
orders o
ORDER BY
o.customerid, o.orderdate
)
SELECT
*,
(co.next_order_date - co.initial_order_date) as daysbetween
FROM
customer_orders co
WHERE
(co.next_order_date - co.initial_order_date) <= 5
""";
%%sql
-- SQLite version of the window-function approach: LEAD() gives each order
-- the same customer's next order date; JulianDay difference converts the
-- two date strings into a day count.
SELECT *,
cast((JulianDay(co.next_order_date) - JulianDay(co.initial_order_date))
AS int) AS daysbetween
FROM
(SELECT customerid,
orderid AS initial_order_id,
orderdate AS initial_order_date,
LEAD (o.orderdate, 1) OVER (PARTITION BY customerid
ORDER BY orderdate) AS next_order_date
FROM orders o
ORDER BY o.customerid,
o.orderdate) AS co
-- Each customer's last order has NULL next_order_date; the comparison is
-- not true for NULL, so those rows drop out here.
WHERE (JulianDay(co.next_order_date) - JulianDay(co.initial_order_date)) <= 5
LIMIT 2;
customerid | initial_order_id | initial_order_date | next_order_date | daysbetween | |
---|---|---|---|---|---|
0 | ANTON | 10677 | 1997-09-22 | 1997-09-25 | 3 |
1 | AROUT | 10741 | 1997-11-14 | 1997-11-17 | 3 |
# Report the notebook's wall-clock runtime as hours / minutes / seconds.
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 60 * 60)   # h = whole hours, m = leftover seconds
mins, secs = divmod(m, 60)           # split the leftover into min/sec
print('Time taken: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, mins, secs))
Time taken: 0 hr 0 min 26 secs