This tutorial is based on the book "SQL Practice Problems"
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import functools # reduce
import sqlite3
import findspark, pyspark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
Author: Bhishan Poudel

Python implementation: CPython
Python version       : 3.7.7
IPython version      : 7.16.1

Compiler    : Clang 4.0.1 (tags/RELEASE_401/final)
OS          : Darwin
Release     : 19.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit

pyspark  : 3.1.2
sqlite3  : 2.6.0
findspark: 1.4.2
numpy    : 1.17.5
json     : 2.0.9
autopep8 : 1.5.4
pandas   : 1.0.5
# pyspark settings
import os
import findspark
HOME = os.path.expanduser('~')
findspark.init(HOME + "/Softwares/Spark/spark-3.1.2-bin-hadoop3.2")
# Set spark environments
os.environ['PYSPARK_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col as _col
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.window import Window
from pyspark.sql.types import (StructField,
                               StringType, IntegerType, BooleanType,
                               FloatType, DoubleType, DateType,
                               StructType)
from pyspark.sql.functions import (mean as _mean, min as _min,
max as _max, avg as _avg,
when as _when
)
spark = SparkSession\
.builder\
.appName("bhishan")\
.getOrCreate()
sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")
print(pyspark.__version__)
3.1.2
!pwd
/Volumes/Media/github/Tutorials_and_Lessons/SQL/c02_Book_sql_practice_problems_57
!ls data
Northwind.png Northwind_small.sqlite orders_data.sql Northwind.sqlite northwind.sql suppliers_products.png
# load the jupyter magic from ipython-sql
%load_ext sql
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False # no Done, rows affected
%config SqlMagic.autopandas=True
%sql sqlite:////Volumes/Media/github/Tutorials_and_Lessons/SQL/c02_Book_sql_practice_problems_57/data/Northwind.sqlite
# list the active connections (returns a dict)
con_dict = %sql --connections
print(con_dict)
{'sqlite:////Volumes/Media/github/Tutorials_and_Lessons/SQL/c02_Book_sql_practice_problems_57/data/Northwind.sqlite': <sql.connection.Connection object at 0x7fc52b48ad50>}
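As an aside, the same database can also be queried with the plain sqlite3 module imported above plus pandas.read_sql, without the ipython-sql magic. A minimal sketch, assuming the relative path data/Northwind.sqlite seen in the directory listing above:
# a minimal sketch: pull a table with sqlite3 + pandas instead of the %sql magic
con = sqlite3.connect('data/Northwind.sqlite')
customer_df = pd.read_sql('SELECT * FROM Customer LIMIT 2;', con)
con.close()
customer_df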
%%sql
SELECT *
FROM sqlite_master
WHERE TYPE='table';
type | name | tbl_name | rootpage | sql | |
---|---|---|---|---|---|
0 | table | Employee | Employee | 2 | CREATE TABLE "Employee" \n(\n "Id" INTEGER PR... |
1 | table | Category | Category | 3 | CREATE TABLE "Category" \n(\n "Id" INTEGER PR... |
2 | table | Customer | Customer | 4 | CREATE TABLE "Customer" \n(\n "Id" VARCHAR(80... |
3 | table | Shipper | Shipper | 8 | CREATE TABLE "Shipper" \n(\n "Id" INTEGER PRI... |
4 | table | Supplier | Supplier | 9 | CREATE TABLE "Supplier" \n(\n "Id" INTEGER PR... |
5 | table | Product | Product | 12 | CREATE TABLE "Product" \n(\n "Id" INTEGER PRI... |
6 | table | OrderDetail | OrderDetail | 14 | CREATE TABLE "OrderDetail" \n(\n "Id" VARCHAR... |
7 | table | CustomerCustomerDemo | CustomerCustomerDemo | 16 | CREATE TABLE "CustomerCustomerDemo" \n(\n "Id... |
8 | table | CustomerDemographic | CustomerDemographic | 18 | CREATE TABLE "CustomerDemographic" \n(\n "Id"... |
9 | table | Region | Region | 21 | CREATE TABLE "Region" \n(\n "Id" INTEGER PRIM... |
10 | table | Territory | Territory | 22 | CREATE TABLE "Territory" \n(\n "Id" VARCHAR(8... |
11 | table | EmployeeTerritory | EmployeeTerritory | 24 | CREATE TABLE "EmployeeTerritory" \n(\n "Id" V... |
12 | table | Order | Order | 11 | CREATE TABLE "Order" (\n"index" INTEGER,\n "O... |
13 | table | orders | orders | 155 | CREATE TABLE orders (\n orderid smallint NO... |
# %%sql
# SELECT *
# FROM Order
# LIMIT 3;
# the Order table is problematic: 'Order' is a reserved SQL keyword (it would need quoting),
# so we use the orders table instead.
c = 'VINET'
%%sql
SELECT *
FROM Orders
WHERE customerid = '{c}'
LIMIT 3;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10274 | VINET | 6 | 1996-08-06 | 1996-09-03 | 1996-08-16 | 1 | 6.010000 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
2 | 10295 | VINET | 2 | 1996-09-02 | 1996-09-30 | 1996-09-10 | 2 | 1.150000 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
Employee = %sql SELECT * from Employee;
Category = %sql SELECT * from Category;
Customer = %sql SELECT * from Customer;
Shipper = %sql SELECT * from Shipper;
Supplier = %sql SELECT * from Supplier;
# Order = %sql SELECT * from Order;
Orders = %sql SELECT * from Orders;
Product = %sql SELECT * from Product;
OrderDetail = %sql SELECT * from OrderDetail;
CustomerCustomerDemo = %sql SELECT * from CustomerCustomerDemo;
CustomerDemographic = %sql SELECT * from CustomerDemographic;
Region = %sql SELECT * from Region;
Territory = %sql SELECT * from Territory;
EmployeeTerritory = %sql SELECT * from EmployeeTerritory;
dfs_names = [ 'Employee', 'Category','Customer',
'Shipper','Supplier','Orders',
'Product','OrderDetail',
'CustomerCustomerDemo','CustomerDemographic',
'Region', 'Territory', 'EmployeeTerritory']
dfs = [ Employee, Category,Customer,Shipper,Supplier,
Orders, Product,OrderDetail,
CustomerCustomerDemo,CustomerDemographic,
Region, Territory, EmployeeTerritory]
for df_name, df in zip(dfs_names,dfs):
print(df_name, df.shape)
display(df.head(2))
Employee (9, 18)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
1 | 2 | Fuller | Andrew | Vice President, Sales | Dr. | 1952-02-19 | 1992-08-14 | 908 W. Capital Way | Tacoma | WA | 98401 | USA | (206) 555-9482 | 3457 | None | Andrew received his BTS commercial in 1974 and... | NaN | http://accweb/emmployees/fuller.bmp |
Category (8, 3)
Id | CategoryName | Description | |
---|---|---|---|
0 | 1 | Beverages | Soft drinks, coffees, teas, beers, and ales |
1 | 2 | Condiments | Sweet and savory sauces, relishes, spreads, an... |
Customer (91, 11)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Shipper (3, 3)
Id | CompanyName | Phone | |
---|---|---|---|
0 | 1 | Speedy Express | (503) 555-9831 |
1 | 2 | United Package | (503) 555-3199 |
Supplier (29, 12)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | HomePage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Exotic Liquids | Charlotte Cooper | Purchasing Manager | 49 Gilbert St. | London | None | EC1 4SD | UK | (171) 555-2222 | None | None |
1 | 2 | New Orleans Cajun Delights | Shelley Burke | Order Administrator | P.O. Box 78934 | New Orleans | LA | 70117 | USA | (100) 555-4822 | None | #CAJUN.HTM# |
Orders (830, 14)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Product (77, 10)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
OrderDetail (2155, 6)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
CustomerCustomerDemo (0, 0)
CustomerDemographic (0, 0)
Region (4, 2)
Id | RegionDescription | |
---|---|---|
0 | 1 | Eastern |
1 | 2 | Western |
Territory (53, 3)
Id | TerritoryDescription | RegionId | |
---|---|---|---|
0 | 01581 | Westboro | 1 |
1 | 01730 | Bedford | 1 |
EmployeeTerritory (49, 3)
Id | EmployeeId | TerritoryId | |
---|---|---|---|
0 | 1/06897 | 1 | 06897 |
1 | 1/19713 | 1 | 19713 |
def spark_df_from_pandas(pandas_df):
    """Create a Spark DataFrame from a pandas DataFrame with an explicit schema."""
    # map each pandas dtype to the name of the corresponding Spark type
    df_dtype = pandas_df.dtypes.astype(str).reset_index()
    df_dtype.columns = ['column', 'dtype']
    mapping = {'int64'         : 'IntegerType()',
               'float64'       : 'DoubleType()',
               'bool'          : 'BooleanType()',
               'object'        : 'StringType()',
               'datetime64[ns]': 'DateType()',
               }
    df_dtype['dtype'] = df_dtype['dtype'].map(mapping)

    # build the schema as a string, e.g. "StructType([StructField('Id',IntegerType(),True), ...])"
    df_dtype['schema'] = "    StructField('" +\
                         df_dtype['column'] + "'," +\
                         df_dtype['dtype'] + ",True),"
    head = 'StructType([\n'
    body = '\n'.join(df_dtype['schema'])
    tail = '\n    ])'
    schema = head + body + tail

    # evaluate the schema string and create the Spark DataFrame
    spark_df = sqlContext.createDataFrame(pandas_df, eval(schema))
    return spark_df
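The string-building plus eval approach above works, but the schema can also be constructed directly. A minimal sketch that mirrors the same dtype mapping (spark_df_from_pandas_v2 is just an illustrative name):
# a sketch of the same idea without the string + eval() step:
# build the StructType directly from the pandas dtypes.
from pyspark.sql.types import (StructType, StructField, StringType,
                               IntegerType, DoubleType, BooleanType, DateType)

def spark_df_from_pandas_v2(pandas_df):
    type_map = {'int64': IntegerType(), 'float64': DoubleType(),
                'bool': BooleanType(), 'object': StringType(),
                'datetime64[ns]': DateType()}
    fields = [StructField(col, type_map.get(str(dtype), StringType()), True)
              for col, dtype in pandas_df.dtypes.items()]
    return spark.createDataFrame(pandas_df, StructType(fields))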
sEmployee = spark_df_from_pandas(Employee)
sEmployee.createOrReplaceTempView("Employee")
sCategory = spark_df_from_pandas(Category)
sCategory.createOrReplaceTempView("Category")
sCustomer = spark_df_from_pandas(Customer)
sCustomer.createOrReplaceTempView("Customer")
sShipper = spark_df_from_pandas(Shipper)
sShipper.createOrReplaceTempView("Shipper")
sSupplier = spark_df_from_pandas(Supplier)
sSupplier.createOrReplaceTempView("Supplier")
sOrders = spark_df_from_pandas(Orders)
sOrders.createOrReplaceTempView("Orders")
sProduct = spark_df_from_pandas(Product)
sProduct.createOrReplaceTempView("Product")
sOrderDetail = spark_df_from_pandas(OrderDetail)
sOrderDetail.createOrReplaceTempView("OrderDetail")
try:
sCustomerCustomerDemo = spark_df_from_pandas(CustomerCustomerDemo)
except:
pass
# IndexError: list index out of range
try:
sCustomerCustomerDemo.createOrReplaceTempView("CustomerCustomerDemo")
except:
pass
try:
sCustomerDemographic = spark_df_from_pandas(CustomerDemographic)
sCustomerDemographic.createOrReplaceTempView("CustomerDemographic")
except:
pass
sRegion = spark_df_from_pandas(Region)
sRegion.createOrReplaceTempView("Region")
sTerritory = spark_df_from_pandas(Territory)
sTerritory.createOrReplaceTempView("Territory")
sEmployeeTerritory = spark_df_from_pandas(EmployeeTerritory)
sEmployeeTerritory.createOrReplaceTempView("EmployeeTerritory")
sOrders.createOrReplaceTempView("Orders") # create Orders table.
sOrders.printSchema()
root
 |-- orderid: integer (nullable = true)
 |-- customerid: string (nullable = true)
 |-- employeeid: integer (nullable = true)
 |-- orderdate: string (nullable = true)
 |-- requireddate: string (nullable = true)
 |-- shippeddate: string (nullable = true)
 |-- shipvia: integer (nullable = true)
 |-- freight: double (nullable = true)
 |-- shipname: string (nullable = true)
 |-- shipaddress: string (nullable = true)
 |-- shipcity: string (nullable = true)
 |-- shipregion: string (nullable = true)
 |-- shippostalcode: string (nullable = true)
 |-- shipcountry: string (nullable = true)
# sEmployee.show(2)
# spark.sql('select * from Employee limit 2').show()
For this problem, we’d like to see the total number of products in each category. Sort the results by the total number of products, in descending order.
Category.head(2)
Id | CategoryName | Description | |
---|---|---|---|
0 | 1 | Beverages | Soft drinks, coffees, teas, beers, and ales |
1 | 2 | Condiments | Sweet and savory sauces, relishes, spreads, an... |
Product.head(2)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
Product[['CategoryId']]\
.merge(Category[['Id','CategoryName']],
left_on='CategoryId',
right_on='Id')\
['CategoryName'].value_counts(ascending=False)
Confections       13
Seafood           12
Condiments        12
Beverages         12
Dairy Products    10
Grains/Cereals     7
Meat/Poultry       6
Produce            5
Name: CategoryName, dtype: int64
%%sql
SELECT CategoryName,
count(CategoryName)
FROM Product P
INNER JOIN Category C ON P.CategoryId = C.Id
GROUP BY CategoryName
ORDER BY count(CategoryName) DESC;
CategoryName | count(CategoryName) | |
---|---|---|
0 | Confections | 13 |
1 | Seafood | 12 |
2 | Condiments | 12 |
3 | Beverages | 12 |
4 | Dairy Products | 10 |
5 | Grains/Cereals | 7 |
6 | Meat/Poultry | 6 |
7 | Produce | 5 |
q = """
select CategoryName, count(CategoryName)
from Product P inner join Category C
on P.CategoryId = C.Id
group by CategoryName
order by count(CategoryName) desc
"""
spark.sql(q).show()
+--------------+-------------------+
|  CategoryName|count(CategoryName)|
+--------------+-------------------+
|   Confections|                 13|
|     Beverages|                 12|
|       Seafood|                 12|
|    Condiments|                 12|
|Dairy Products|                 10|
|Grains/Cereals|                  7|
|  Meat/Poultry|                  6|
|       Produce|                  5|
+--------------+-------------------+
sProduct.select(['CategoryId'])\
.join(sCategory.withColumnRenamed('Id','CategoryId')\
.select('CategoryId','CategoryName'),
on='CategoryId')\
.groupby('CategoryName')\
.count()\
.orderBy('count',ascending=False)\
.show()
+--------------+-----+
|  CategoryName|count|
+--------------+-----+
|   Confections|   13|
|     Beverages|   12|
|       Seafood|   12|
|    Condiments|   12|
|Dairy Products|   10|
|Grains/Cereals|    7|
|  Meat/Poultry|    6|
|       Produce|    5|
+--------------+-----+
In the Customers table, show the total number of customers per Country and City.
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Customer.groupby(['Country', 'City'])['Id']\
.count().rename('count').reset_index().head(2)
Country | City | count | |
---|---|---|---|
0 | Argentina | Buenos Aires | 3 |
1 | Austria | Graz | 1 |
%%sql
SELECT Country,
City,
count(*) AS COUNT
FROM Customer
GROUP BY Country,
City
LIMIT 2;
Country | City | COUNT | |
---|---|---|---|
0 | Argentina | Buenos Aires | 3 |
1 | Austria | Graz | 1 |
sCustomer.groupby(['Country', 'City']).count().show(2)
+-------+---------+-----+
|Country|     City|count|
+-------+---------+-----+
|Belgium|Charleroi|    1|
|Germany|  Münster|    1|
+-------+---------+-----+
only showing top 2 rows
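The Spark groupby result comes back unordered; to line it up with the pandas and SQL answers, sort explicitly. A small sketch:
# sort the grouped counts so the output order matches the pandas/SQL answers
sCustomer.groupby(['Country', 'City']).count()\
         .orderBy(['Country', 'City']).show(2)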
What products do we have in our inventory that should be reordered? For now, just use the fields UnitsInStock and ReorderLevel, where UnitsInStock is less than the ReorderLevel, ignoring the fields UnitsOnOrder and Discontinued. Order the results by ProductID.
Product.head(2)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
Product[Product.UnitsInStock < Product.ReorderLevel].sort_values('Id')\
[['Id','ProductName','UnitsInStock','ReorderLevel']]
Id | ProductName | UnitsInStock | ReorderLevel | |
---|---|---|---|---|
1 | 2 | Chang | 17 | 25 |
2 | 3 | Aniseed Syrup | 13 | 25 |
10 | 11 | Queso Cabrales | 22 | 30 |
20 | 21 | Sir Rodney's Scones | 3 | 5 |
29 | 30 | Nord-Ost Matjeshering | 10 | 15 |
30 | 31 | Gorgonzola Telino | 0 | 20 |
31 | 32 | Mascarpone Fabioli | 9 | 25 |
36 | 37 | Gravad lax | 11 | 25 |
42 | 43 | Ipoh Coffee | 17 | 25 |
44 | 45 | Rogede sild | 5 | 15 |
47 | 48 | Chocolade | 15 | 25 |
48 | 49 | Maxilaku | 10 | 15 |
55 | 56 | Gnocchi di nonna Alice | 21 | 30 |
63 | 64 | Wimmers gute Semmelknödel | 22 | 30 |
65 | 66 | Louisiana Hot Spiced Okra | 4 | 20 |
67 | 68 | Scottish Longbreads | 6 | 15 |
69 | 70 | Outback Lager | 15 | 30 |
73 | 74 | Longlife Tofu | 4 | 5 |
%%sql
SELECT Id,
ProductName,
UnitsInStock,
ReorderLevel
FROM Product
WHERE UnitsInStock < ReorderLevel
ORDER BY Id;
Id | ProductName | UnitsInStock | ReorderLevel | |
---|---|---|---|---|
0 | 2 | Chang | 17 | 25 |
1 | 3 | Aniseed Syrup | 13 | 25 |
2 | 11 | Queso Cabrales | 22 | 30 |
3 | 21 | Sir Rodney's Scones | 3 | 5 |
4 | 30 | Nord-Ost Matjeshering | 10 | 15 |
5 | 31 | Gorgonzola Telino | 0 | 20 |
6 | 32 | Mascarpone Fabioli | 9 | 25 |
7 | 37 | Gravad lax | 11 | 25 |
8 | 43 | Ipoh Coffee | 17 | 25 |
9 | 45 | Rogede sild | 5 | 15 |
10 | 48 | Chocolade | 15 | 25 |
11 | 49 | Maxilaku | 10 | 15 |
12 | 56 | Gnocchi di nonna Alice | 21 | 30 |
13 | 64 | Wimmers gute Semmelknödel | 22 | 30 |
14 | 66 | Louisiana Hot Spiced Okra | 4 | 20 |
15 | 68 | Scottish Longbreads | 6 | 15 |
16 | 70 | Outback Lager | 15 | 30 |
17 | 74 | Longlife Tofu | 4 | 5 |
# spark.sql(q).show()
sProduct[sProduct.UnitsInStock < sProduct.ReorderLevel].orderBy('Id')\
[['Id','ProductName','UnitsInStock','ReorderLevel']]
DataFrame[Id: int, ProductName: string, UnitsInStock: int, ReorderLevel: int]
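The cell above displays only the DataFrame schema because .show() was never called; a small sketch that prints the actual rows:
# select the columns explicitly and call .show() to print the rows
sProduct[sProduct.UnitsInStock < sProduct.ReorderLevel]\
    .orderBy('Id')\
    .select('Id', 'ProductName', 'UnitsInStock', 'ReorderLevel')\
    .show(3)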
Now we need to incorporate the fields UnitsInStock, UnitsOnOrder, ReorderLevel, and Discontinued into our calculation. We'll define "products that need reordering" with the following criteria: UnitsInStock plus UnitsOnOrder is less than or equal to ReorderLevel, and Discontinued is 0 (false).
Product.head(2)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
cond1 = (Product['UnitsInStock'] + Product['UnitsOnOrder']) <= Product['ReorderLevel']
cond2 = Product['Discontinued'] == 0
cond = cond1 & cond2
Product[cond][['UnitsInStock', 'UnitsOnOrder', 'ReorderLevel', 'Discontinued']]
UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|
29 | 10 | 0 | 15 | 0 |
69 | 15 | 10 | 30 | 0 |
%%sql
SELECT UnitsInStock,
UnitsOnOrder,
ReorderLevel,
Discontinued
FROM Product
WHERE (UnitsInStock + UnitsOnOrder) <= ReorderLevel
AND Discontinued = 0;
UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|
0 | 10 | 0 | 15 | 0 |
1 | 15 | 10 | 30 | 0 |
cond1 = (sProduct['UnitsInStock'] +
sProduct['UnitsOnOrder']
) <= sProduct['ReorderLevel']
cond2 = sProduct['Discontinued'] == 0
cond = cond1 & cond2
sProduct[cond][['UnitsInStock', 'UnitsOnOrder',
'ReorderLevel', 'Discontinued']].show()
+------------+------------+------------+------------+
|UnitsInStock|UnitsOnOrder|ReorderLevel|Discontinued|
+------------+------------+------------+------------+
|          10|           0|          15|           0|
|          15|          10|          30|           0|
+------------+------------+------------+------------+
A salesperson for Northwind is going on a business trip to visit customers, and would like to see a list of all customers, sorted by region, alphabetically. However, he wants the customers with no region (null in the Region field) to be at the end, instead of at the top, where you’d normally find the null values. Within the same region, companies should be sorted by CustomerID.
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Customer[['Id','CompanyName','Region']].sort_values(['Region','Id']).head(4)
Id | CompanyName | Region | |
---|---|---|---|
54 | OLDWO | Old World Delicatessen | AK |
9 | BOTTM | Bottom-Dollar Markets | BC |
41 | LAUGB | Laughing Bacchus Wine Cellars | BC |
44 | LETSS | Let's Stop N Shop | CA |
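pandas already places missing values last when sorting, which is why no NaN regions appear in the head above; na_position makes that behaviour explicit. A small sketch:
# sort_values puts NaN/None last by default; na_position spells it out
Customer[['Id', 'CompanyName', 'Region']]\
    .sort_values(['Region', 'Id'], na_position='last')\
    .head(4)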
%%sql
-- Note: region is used twice in ORDER BY: once to push NULLs to the end, once to sort.
SELECT Id,
CompanyName,
Region
FROM Customer
ORDER BY region IS NULL,
region,
id;
-- alternative: ORDER BY case when Region is null then 1 else 0 end asc, region asc, Id asc;
sCustomer.select('Id', 'CompanyName', 'Region').orderBy(
sCustomer.Region.asc_nulls_last(), 'Id').show(3)
+-----+--------------------+------+
|   Id|         CompanyName|Region|
+-----+--------------------+------+
|OLDWO|Old World Delicat...|    AK|
|BOTTM|Bottom-Dollar Mar...|    BC|
|LAUGB|Laughing Bacchus ...|    BC|
+-----+--------------------+------+
only showing top 3 rows
Some of the countries we ship to have very high freight charges. We'd like to investigate some more shipping options for our customers, to be able to offer them lower freight charges. Return the three ship countries with the highest average freight overall, in descending order by average freight.
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
# good
Orders.groupby('shipcountry')['freight'].mean().rename('avg_freight').nlargest(3).reset_index()
shipcountry | avg_freight | |
---|---|---|
0 | Austria | 184.787501 |
1 | Ireland | 145.012629 |
2 | USA | 112.879427 |
# good (Named Aggregation pandas 1.0)
Orders.groupby('shipcountry').agg(
avg_freight = ('freight','mean')
).nlargest(3,'avg_freight')
avg_freight | |
---|---|
shipcountry | |
Austria | 184.787501 |
Ireland | 145.012629 |
USA | 112.879427 |
# bad
Orders.groupby('shipcountry')['freight'].mean().sort_values(ascending=False).head(3)
shipcountry
Austria    184.787501
Ireland    145.012629
USA        112.879427
Name: freight, dtype: float64
%%sql
select * from Orders limit 2;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
%%sql
SELECT shipcountry,
avg(freight)
FROM Orders
GROUP BY shipcountry;
shipcountry | avg(freight) | |
---|---|---|
0 | Argentina | 37.411250 |
1 | Austria | 184.787501 |
2 | Belgium | 67.375788 |
3 | Brazil | 58.797470 |
4 | Canada | 73.269666 |
5 | Denmark | 77.566111 |
6 | Finland | 41.404091 |
7 | France | 55.036883 |
8 | Germany | 92.485902 |
9 | Ireland | 145.012629 |
10 | Italy | 30.872857 |
11 | Mexico | 40.099285 |
12 | Norway | 45.916666 |
13 | Poland | 25.105715 |
14 | Portugal | 49.502308 |
15 | Spain | 37.473478 |
16 | Sweden | 87.502703 |
17 | Switzerland | 76.029444 |
18 | UK | 52.754821 |
19 | USA | 112.879427 |
20 | Venezuela | 59.460435 |
%%sql
SELECT shipcountry,
avg(freight)
FROM Orders
GROUP BY shipcountry
ORDER BY avg(freight) DESC
LIMIT 3;
shipcountry | avg(freight) | |
---|---|---|
0 | Austria | 184.787501 |
1 | Ireland | 145.012629 |
2 | USA | 112.879427 |
sOrders.groupby('shipcountry')\
.agg({'freight':'avg'})\
.orderBy('avg(freight)',ascending=False)\
.limit(3)\
.show()
+-----------+------------------+
|shipcountry|      avg(freight)|
+-----------+------------------+
|    Austria|     184.787500619|
|    Ireland|145.01262900000003|
|        USA|112.87942658734427|
+-----------+------------------+
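The same aggregation can be written with pyspark.sql.functions (imported above as F) to give the column a cleaner alias. A minimal sketch:
# explicit avg + alias avoids the auto-generated 'avg(freight)' column name
sOrders.groupby('shipcountry')\
       .agg(F.round(F.avg('freight'), 2).alias('avg_freight'))\
       .orderBy(F.col('avg_freight').desc())\
       .show(3)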
spark.sql(q.replace('Orders','Order')).show()  # note: q still holds the earlier category-count query, so this just repeats that result
+--------------+-------------------+
|  CategoryName|count(CategoryName)|
+--------------+-------------------+
|   Confections|                 13|
|       Seafood|                 12|
|     Beverages|                 12|
|    Condiments|                 12|
|Dairy Products|                 10|
|Grains/Cereals|                  7|
|  Meat/Poultry|                  6|
|       Produce|                  5|
+--------------+-------------------+
We're continuing with the question above on high freight charges. Now, instead of using all the orders we have, we only want to see orders from the year 1997.
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders['orderdate'] = pd.to_datetime(Orders['orderdate'])
Orders['requireddate'] = pd.to_datetime(Orders['requireddate'])
Orders['shippeddate'] = pd.to_datetime(Orders['shippeddate'])
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders.orderdate.apply([min,max])
min   1996-07-04
max   1998-05-06
Name: orderdate, dtype: datetime64[ns]
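Equivalently, and a little more idiomatic than apply([min, max]), the aggregation API can be used directly:
# same result with the standard aggregation API
Orders['orderdate'].agg(['min', 'max'])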
# I have altered the Orders data (it spans 1996-1998), so I use 1997 instead of the book's 2015.
Orders[Orders.orderdate >= '1997-01-01'].head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
152 | 10400 | EASTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-16 | 3 | 83.93 | Eastern Connection | 35 King George | London | None | WX3 6FW | UK |
153 | 10401 | RATTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-10 | 1 | 12.51 | Rattlesnake Canyon Grocery | 2817 Milton Dr. | Albuquerque | NM | 87110 | USA |
df1 = Orders[Orders.orderdate.between('1997-01-01','1998-01-01')]
df1.head(2).append(df1.tail(2))
# pandas .between() is inclusive on both ends
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
152 | 10400 | EASTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-16 | 3 | 83.93 | Eastern Connection | 35 King George | London | None | WX3 6FW | UK |
153 | 10401 | RATTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-10 | 1 | 12.51 | Rattlesnake Canyon Grocery | 2817 Milton Dr. | Albuquerque | NM | 87110 | USA |
561 | 10809 | WELLI | 7 | 1998-01-01 | 1998-01-29 | 1998-01-07 | 1 | 4.87 | Wellington Importadora | Rua do Mercado, 12 | Resende | SP | 08737-363 | Brazil |
562 | 10810 | LAUGB | 2 | 1998-01-01 | 1998-01-29 | 1998-01-07 | 3 | 4.33 | Laughing Bacchus Wine Cellars | 2319 Elm St. | Vancouver | BC | V3F 2K1 | Canada |
df1 = Orders[ (Orders.orderdate >= '1997-01-01') &
(Orders.orderdate < '1998-01-01')]
df1.head(2).append(df1.tail(2))
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
152 | 10400 | EASTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-16 | 3 | 83.930000 | Eastern Connection | 35 King George | London | None | WX3 6FW | UK |
153 | 10401 | RATTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-10 | 1 | 12.510000 | Rattlesnake Canyon Grocery | 2817 Milton Dr. | Albuquerque | NM | 87110 | USA |
558 | 10806 | VICTE | 3 | 1997-12-31 | 1998-01-28 | 1998-01-05 | 2 | 22.110001 | Victuailles en stock | 2, rue du Commerce | Lyon | None | 69004 | France |
559 | 10807 | FRANS | 4 | 1997-12-31 | 1998-01-28 | 1998-01-30 | 1 | 1.360000 | Franchi S.p.A. | Via Monte Bianco 34 | Torino | None | 10100 | Italy |
df1.groupby('shipcountry')[['freight']].mean().sort_values('freight',ascending=False)
freight | |
---|---|
shipcountry | |
Austria | 178.364286 |
Switzerland | 117.177500 |
Sweden | 105.159999 |
Canada | 99.279999 |
Ireland | 98.076999 |
Germany | 97.383594 |
USA | 96.989501 |
Denmark | 94.622727 |
Venezuela | 71.044500 |
Belgium | 65.782857 |
France | 63.275898 |
Brazil | 53.000238 |
Mexico | 52.959166 |
Poland | 52.220001 |
Finland | 48.931539 |
Spain | 42.419999 |
UK | 39.618333 |
Portugal | 39.212857 |
Italy | 29.378000 |
Norway | 26.005000 |
Argentina | 19.610000 |
Orders[ (Orders.orderdate >= '1997-01-01') &
(Orders.orderdate < '1998-01-01')]\
.groupby('shipcountry')\
.agg(avg_freight = ('freight','mean')
).nlargest(3,'avg_freight')
avg_freight | |
---|---|
shipcountry | |
Austria | 178.364286 |
Switzerland | 117.177500 |
Sweden | 105.159999 |
%%sql
SELECT *
FROM Orders
WHERE orderdate BETWEEN '1997-01-01' AND '1998-01-01'
LIMIT 3;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10400 | EASTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-16 | 3 | 83.930000 | Eastern Connection | 35 King George | London | None | WX3 6FW | UK |
1 | 10401 | RATTC | 1 | 1997-01-01 | 1997-01-29 | 1997-01-10 | 1 | 12.510000 | Rattlesnake Canyon Grocery | 2817 Milton Dr. | Albuquerque | NM | 87110 | USA |
2 | 10402 | ERNSH | 8 | 1997-01-02 | 1997-02-13 | 1997-01-10 | 2 | 67.879997 | Ernst Handel | Kirchgasse 6 | Graz | None | 8010 | Austria |
%%sql
-- without casting as date
-- BETWEEN is inclusive on both ends.
SELECT shipcountry,
avg(freight)
FROM Orders
WHERE orderdate BETWEEN '1997-01-01' AND '1997-12-31'
GROUP BY shipcountry
ORDER BY avg(freight) DESC
LIMIT 3;
shipcountry | avg(freight) | |
---|---|---|
0 | Austria | 178.364286 |
1 | Switzerland | 117.177500 |
2 | Sweden | 105.159999 |
%%sql
-- using date
SELECT shipcountry,
avg(freight)
FROM Orders
WHERE cast(orderdate AS date) >= cast('1997-01-01' AS date)
AND cast(orderdate AS date) < cast('1998-01-01' AS date)
GROUP BY shipcountry
ORDER BY avg(freight) DESC
LIMIT 3;
(sqlite3.OperationalError) near "avg": syntax error [SQL: -- using date SELECT shipcountry, avg(freight) FROM Orders WHERE cast(orderdate AS date) >= cast('1997-01-01' AS date) AND cast(orderdate AS date) < cast('1998-01-01' AS date) GROUP BY shipcountry ORDER BY avg(freight) DESC LIMIT 3;] (Background on this error at: http://sqlalche.me/e/13/e3q8)
sOrders\
[ (sOrders.orderdate >= '1997-01-01') &
(sOrders.orderdate < '1998-01-01')]\
.groupby('shipcountry')\
.agg({'freight':'avg'})\
.orderBy('avg(freight)',ascending=False)\
.limit(3)\
.show()
+-----------+------------------+
|shipcountry|      avg(freight)|
+-----------+------------------+
|    Austria|178.36428571714285|
|Switzerland|     117.177499675|
|     Sweden|105.15999935000002|
+-----------+------------------+
# Note: this question essentially asks us to avoid the BETWEEN operator,
# since BETWEEN is inclusive on both ends.
We're continuing to work on high freight charges. We now want to get the three ship countries with the highest average freight charges. But instead of filtering for a particular year, we want to use the last 12 months of order data, using as the end date the last OrderDate in Orders.
# we only care about orders from the last 12 months of the data.
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders.orderdate.max()
Timestamp('1998-05-06 00:00:00')
Orders.orderdate.max() - pd.Timedelta('12 M') # careful: 'M' means minutes in Timedelta, so this is 12 minutes before, not 12 months
Timestamp('1998-05-05 23:48:00')
Orders.orderdate.max() - np.timedelta64(12,'M')
Timestamp('1997-05-05 18:10:48')
Orders.orderdate.max() - pd.DateOffset(months=12)
Timestamp('1997-05-06 00:00:00')
pd.to_datetime('2020-12-31') - pd.DateOffset(months=12)
Timestamp('2019-12-31 00:00:00')
cond1 = Orders.orderdate <= Orders.orderdate.max()
cond2 = Orders.orderdate > (Orders.orderdate.max() -
pd.DateOffset(months=12))
cond = cond1 & cond2
df1 = Orders[cond]
df1.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
281 | 10529 | MAISD | 5 | 1997-05-07 | 1997-06-04 | 1997-05-09 | 2 | 66.690002 | Maison Dewey | Rue Joseph-Bens 532 | Bruxelles | None | B-1180 | Belgium |
282 | 10530 | PICCO | 3 | 1997-05-08 | 1997-06-05 | 1997-05-12 | 2 | 339.220001 | Piccolo und mehr | Geislweg 14 | Salzburg | None | 5020 | Austria |
# Orders[Orders.orderdate == Orders.orderdate.max()].head(2)
Orders[cond]\
.groupby('shipcountry')\
.agg(avg_freight = ('freight','mean')
).nlargest(3,'avg_freight')
avg_freight | |
---|---|
shipcountry | |
Ireland | 200.209995 |
Austria | 186.459601 |
USA | 119.303256 |
%%sql
SELECT date(orderdate)
FROM Orders
LIMIT 3;
date(orderdate) | |
---|---|
0 | 1996-07-04 |
1 | 1996-07-05 |
2 | 1996-07-08 |
%%sql
SELECT date(orderdate, '-12 month')
FROM Orders
LIMIT 3;
date(orderdate, '-12 month') | |
---|---|
0 | 1995-07-04 |
1 | 1995-07-05 |
2 | 1995-07-08 |
%%sql
SELECT max(orderdate)
FROM Orders
LIMIT 3;
max(orderdate) | |
---|---|
0 | 1998-05-06 |
Orders.orderdate.max()
Timestamp('1998-05-06 00:00:00')
%%sql
SELECT date(max(orderdate), '-12 month')
FROM Orders
LIMIT 3;
date(max(orderdate), '-12 month') | |
---|---|
0 | 1997-05-06 |
%%sql
SELECT shipcountry,
round(avg(freight), 2)
FROM Orders
WHERE date(orderdate) >=
(SELECT date(max(orderdate), '-12 month')
FROM Orders)
AND date(orderdate) <
(SELECT date(max(orderdate))
FROM Orders)
GROUP BY shipcountry
ORDER BY avg(freight) DESC
LIMIT 3;
shipcountry | round(avg(freight), 2) | |
---|---|---|
0 | Ireland | 200.21 |
1 | Austria | 186.46 |
2 | USA | 119.24 |
sdf = sOrders[['orderdate','shipcountry','freight']]
sdf.show(2)
+----------+-----------+----------+
| orderdate|shipcountry|   freight|
+----------+-----------+----------+
|1996-07-04|     France|32.3800011|
|1996-07-05|    Germany|11.6099997|
+----------+-----------+----------+
only showing top 2 rows
from datetime import datetime
from dateutil.relativedelta import relativedelta
orderdate_max_str = sOrders.agg({'orderdate':'max'}).collect()[0][0]
orderdate_max = datetime.strptime(orderdate_max_str,'%Y-%m-%d')
orderdate_max
datetime.datetime(1998, 5, 6, 0, 0)
from datetime import datetime
from dateutil.relativedelta import relativedelta
last_year = orderdate_max - relativedelta(years=1)
last_year
datetime.datetime(1997, 5, 6, 0, 0)
orderdate_max_str = sOrders.agg({'orderdate':'max'}).collect()[0][0]
orderdate_max = datetime.strptime(orderdate_max_str,'%Y-%m-%d')
last_year = orderdate_max - relativedelta(years=1)
cond1 = sOrders.orderdate <= orderdate_max
cond2 = sOrders.orderdate > last_year
cond = cond1 & cond2
sOrders[cond]\
.groupby('shipcountry')\
.agg({'freight':'avg'})\
.orderBy('avg(freight)',ascending=False)\
.withColumn('avg_freight', F.round(F.col('avg(freight)'),2))\
.drop('avg(freight)')\
.limit(3)\
.show()
+-----------+-----------+
|shipcountry|avg_freight|
+-----------+-----------+
|    Ireland|     200.21|
|    Austria|     186.46|
|        USA|      119.3|
+-----------+-----------+
# spark.sql(...) with the SQL above does not work here: the SQLite-style date(x, '-12 month') modifier is not available in Spark SQL.
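Another option computes the 12-month cutoff inside Spark itself rather than collecting the max order date into Python and using relativedelta. A hedged sketch using to_date and add_months from pyspark.sql.functions (imported above as F); it uses the same open lower bound as the pandas solution above:
# compute the 12-month cutoff in Spark (orderdate is stored as a yyyy-MM-dd
# string, as printSchema showed, so convert it with to_date first)
max_date = sOrders.select(F.max(F.to_date('orderdate')).alias('mx'))
cutoff = max_date.select(F.add_months('mx', -12).alias('cutoff')).collect()[0]['cutoff']

sOrders.filter(F.to_date('orderdate') > F.lit(cutoff))\
       .groupby('shipcountry')\
       .agg(F.round(F.avg('freight'), 2).alias('avg_freight'))\
       .orderBy(F.col('avg_freight').desc())\
       .show(3)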
We're doing inventory, and need to show information like the below, for all orders. Sort by OrderID and Product ID.
Expected Results:
EmployeeID LastName OrderID ProductName Quantity
Look at the entity-relationship diagram:
# from the ER diagram, we see we need to join 4 tables:
# Product, OrderDetail, Orders, and Employee.
Product.head(2)
Id | ProductName | SupplierId | CategoryId | QuantityPerUnit | UnitPrice | UnitsInStock | UnitsOnOrder | ReorderLevel | Discontinued | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Chai | 1 | 1 | 10 boxes x 20 bags | 18.0 | 39 | 0 | 10 | 0 |
1 | 2 | Chang | 1 | 1 | 24 - 12 oz bottles | 19.0 | 17 | 40 | 25 | 0 |
OrderDetail.head(2)
Id | OrderId | ProductId | UnitPrice | Quantity | Discount | |
---|---|---|---|---|---|---|
0 | 10248/11 | 10248 | 11 | 14.0 | 12 | 0.0 |
1 | 10248/42 | 10248 | 42 | 9.8 | 10 | 0.0 |
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Employee.head(2)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | Sales Representative | Ms. | 1948-12-08 | 1992-05-01 | 507 - 20th Ave. E. Apt. 2A | Seattle | WA | 98122 | USA | (206) 555-9857 | 5467 | None | Education includes a BA in psychology from Col... | 2.0 | http://accweb/emmployees/davolio.bmp |
1 | 2 | Fuller | Andrew | Vice President, Sales | Dr. | 1952-02-19 | 1992-08-14 | 908 W. Capital Way | Tacoma | WA | 98401 | USA | (206) 555-9482 | 3457 | None | Andrew received his BTS commercial in 1974 and... | NaN | http://accweb/emmployees/fuller.bmp |
p_od = (Product[['Id','ProductName']]\
.rename(columns={'Id': 'ProductId'})
.join(
OrderDetail[['OrderId','ProductId','Quantity']]
,on='ProductId',lsuffix='',rsuffix='_od'
).drop('ProductId_od',axis=1)
)
p_od.head(2)
ProductId | ProductName | OrderId | Quantity | |
---|---|---|---|---|
0 | 1 | Chai | 10248 | 10 |
1 | 2 | Chang | 10248 | 5 |
Orders[['orderid','employeeid']]\
.rename(columns={'orderid':'OrderID',
'employeeid':'EmployeeID'})\
.head(2)
OrderID | EmployeeID | |
---|---|---|
0 | 10248 | 5 |
1 | 10249 | 6 |
p_od_o = p_od.join(
Orders[['orderid','employeeid']]\
.rename(columns={'orderid':'OrderID',
'employeeid':'EmployeeID'}))
p_od_o.head(2)
ProductId | ProductName | OrderId | Quantity | OrderID | EmployeeID | |
---|---|---|---|---|---|---|
0 | 1 | Chai | 10248 | 10 | 10248 | 5 |
1 | 2 | Chang | 10248 | 5 | 10249 | 6 |
Employee[['Id','LastName']]\
.rename(columns={'Id':'EmployeeID'})\
.head(2)
EmployeeID | LastName | |
---|---|---|
0 | 1 | Davolio |
1 | 2 | Fuller |
p_od_o_e = p_od_o.join(
Employee[['Id','LastName']]\
.rename(columns={'Id':'EmployeeID'}),
on='EmployeeID',lsuffix='',rsuffix='_emp'
)\
.sort_values(['OrderID','ProductId'])\
[['EmployeeID','LastName','OrderID','ProductName','Quantity']]
p_od_o_e.head(2)
EmployeeID | LastName | OrderID | ProductName | Quantity | |
---|---|---|---|---|---|
0 | 5 | Suyama | 10248 | Chai | 10 |
1 | 6 | King | 10249 | Chang | 5 |
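Note that the chained .join calls above align rows on the DataFrame index rather than on the key columns, which is why the employee names here disagree with the SQL result further below (order 10248 belongs to employee 5, Buchanan). A sketch of the same four-table join done with merge on explicit keys, using the column names shown in the tables above:
# join the four tables on their key columns rather than on the index
inv = (OrderDetail[['OrderId', 'ProductId', 'Quantity']]
       .merge(Product[['Id', 'ProductName']],
              left_on='ProductId', right_on='Id')
       .merge(Orders[['orderid', 'employeeid']],
              left_on='OrderId', right_on='orderid')
       .merge(Employee[['Id', 'LastName']],
              left_on='employeeid', right_on='Id', suffixes=('', '_emp'))
       .rename(columns={'employeeid': 'EmployeeID', 'OrderId': 'OrderID'})
       .sort_values(['OrderID', 'ProductId'])
       [['EmployeeID', 'LastName', 'OrderID', 'ProductName', 'Quantity']])
inv.head(3)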
%%sql
-- look at the different tables and note the id columns of each (listed below):
select * from Orders limit 1;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
"""
Product.Id
OrderDetail.orderid productid
Orders orderid employeeid quantity
Employee.Id LastName
""";
%%sql
SELECT e.id AS EmployeeId,
e.lastname,
o.orderid AS OrderId,
p.productname,
od.quantity
FROM Product p
JOIN OrderDetail od ON od.productid = p.id
JOIN Orders o ON o.orderid = od.orderid
JOIN Employee e ON e.id = o.employeeid
ORDER BY o.orderid,
od.productid
LIMIT 3;
EmployeeId | LastName | OrderId | ProductName | Quantity | |
---|---|---|---|---|---|
0 | 5 | Buchanan | 10248 | Queso Cabrales | 12 |
1 | 5 | Buchanan | 10248 | Singaporean Hokkien Fried Mee | 10 |
2 | 5 | Buchanan | 10248 | Mozzarella di Giovanni | 5 |
spark.sql(q).show()  # note: q still holds the earlier category-count query, not this problem's join
+--------------+-------------------+
|  CategoryName|count(CategoryName)|
+--------------+-------------------+
|   Confections|                 13|
|    Condiments|                 12|
|     Beverages|                 12|
|       Seafood|                 12|
|Dairy Products|                 10|
|Grains/Cereals|                  7|
|  Meat/Poultry|                  6|
|       Produce|                  5|
+--------------+-------------------+
There are some customers who have never actually placed an order. Show these customers.
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
Orders.head(2)
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
np.setdiff1d(Customer['Id'], Orders['customerid'])
array(['FISSA', 'PARIS'], dtype=object)
set(Customer['Id']).difference(Orders['customerid'])
{'FISSA', 'PARIS'}
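The same anti-join can be done with merge and its indicator flag, which also keeps the customer details. A small sketch:
# left join Orders onto Customer and keep only the rows with no match
no_orders = (Customer.merge(Orders[['customerid']].drop_duplicates(),
                            left_on='Id', right_on='customerid',
                            how='left', indicator=True)
             .query("_merge == 'left_only'")[['Id', 'CompanyName']])
no_orders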
%%sql
select * from Orders limit 1;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
# Customer Id
# Orders customerid
%%sql
-- We need a LEFT JOIN here (a bare JOIN defaults to an inner join)
SELECT c.id,
o.customerid
FROM Customer c
LEFT JOIN Orders o ON o.customerid = c.id
WHERE o.customerid IS NULL;
Id | customerid | |
---|---|---|
0 | FISSA | None |
1 | PARIS | None |
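A Spark version of the same anti-join, using the left_anti join type on the DataFrames registered above; a small sketch:
# customers with no matching row in Orders
sCustomer.join(sOrders, sCustomer.Id == sOrders.customerid, how='left_anti')\
         .select('Id', 'CompanyName')\
         .show()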
One employee (Margaret Peacock, EmployeeID 4) has placed the most orders. However, there are some customers who've never placed an order with her. Show only those customers who have never placed an order with her.
Orders.head(2)
# NOTE: the Orders table has a customerid column, but it does not contain
# every customer; the complete list of customer ids is in the Customer table.
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
Orders.shape
(830, 14)
Orders.employeeid.value_counts().head(2)
4    156
3    127
Name: employeeid, dtype: int64
Employee[Employee.Id==4].head(2)
Id | LastName | FirstName | Title | TitleOfCourtesy | BirthDate | HireDate | Address | City | Region | PostalCode | Country | HomePhone | Extension | Photo | Notes | ReportsTo | PhotoPath | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 4 | Peacock | Margaret | Sales Representative | Mrs. | 1937-09-19 | 1993-05-03 | 4110 Old Redmond Rd. | Redmond | WA | 98052 | USA | (206) 555-8122 | 5176 | None | Margaret holds a BA in English literature from... | 2.0 | http://accweb/emmployees/peacock.bmp |
Customer.head(2)
Id | CompanyName | ContactName | ContactTitle | Address | City | Region | PostalCode | Country | Phone | Fax | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ALFKI | Alfreds Futterkiste | Maria Anders | Sales Representative | Obere Str. 57 | Berlin | None | 12209 | Germany | 030-0074321 | 030-0076545 |
1 | ANATR | Ana Trujillo Emparedados y helados | Ana Trujillo | Owner | Avda. de la Constitución 2222 | México D.F. | None | 05021 | Mexico | (5) 555-4729 | (5) 555-3745 |
np.setdiff1d(Customer['Id'],
Orders.loc[Orders.employeeid==4, 'customerid']) # .shape
array(['CONSH', 'DUMON', 'FISSA', 'FRANR', 'GROSR', 'LAUGB', 'LAZYK', 'NORTS', 'PARIS', 'PERIC', 'PRINI', 'SANTG', 'SEVES', 'SPECD', 'THEBI', 'VINET'], dtype=object)
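An equivalent pandas approach that keeps the customer columns filters with isin and negation; a small sketch:
# customers whose Id never appears among Peacock's (employeeid 4) orders
peacock_customers = Orders.loc[Orders.employeeid == 4, 'customerid'].unique()
Customer.loc[~Customer.Id.isin(peacock_customers), ['Id', 'CompanyName']]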
%%sql
SELECT *
FROM Orders
LIMIT 2;
orderid | customerid | employeeid | orderdate | requireddate | shippeddate | shipvia | freight | shipname | shipaddress | shipcity | shipregion | shippostalcode | shipcountry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10248 | VINET | 5 | 1996-07-04 | 1996-08-01 | 1996-07-16 | 3 | 32.380001 | Vins et alcools Chevalier | 59 rue de l'Abbaye | Reims | None | 51100 | France |
1 | 10249 | TOMSP | 6 | 1996-07-05 | 1996-08-16 | 1996-07-10 | 1 | 11.610000 | Toms Spezialitäten | Luisenstr. 48 | Münster | None | 44087 | Germany |
# Customer: Id
# Orders: customerid employeeid
%%sql
SELECT c.Id,
o.customerid
FROM Customer c
LEFT JOIN Orders o ON (o.customerid = c.id
AND o.employeeid = 4)
WHERE o.customerid IS NULL;
Id | customerid | |
---|---|---|
0 | CONSH | None |
1 | DUMON | None |
2 | FISSA | None |
3 | FRANR | None |
4 | GROSR | None |
5 | LAUGB | None |
6 | LAZYK | None |
7 | NORTS | None |
8 | PARIS | None |
9 | PERIC | None |
10 | PRINI | None |
11 | SANTG | None |
12 | SEVES | None |
13 | SPECD | None |
14 | THEBI | None |
15 | VINET | None |
spark.sql(q).show()  # note: q still holds the earlier category-count query, not this problem's query
+--------------+-------------------+
|  CategoryName|count(CategoryName)|
+--------------+-------------------+
|   Confections|                 13|
|       Seafood|                 12|
|    Condiments|                 12|
|     Beverages|                 12|
|Dairy Products|                 10|
|Grains/Cereals|                  7|
|  Meat/Poultry|                  6|
|       Produce|                  5|
+--------------+-------------------+
sCustomer[['Id']].show(2)
+-----+
|   Id|
+-----+
|ALFKI|
|ANATR|
+-----+
only showing top 2 rows
sOrders[sOrders.employeeid==4].select('customerid').show(2)
+----------+
|customerid|
+----------+
|     HANAR|
|     SUPRD|
+----------+
only showing top 2 rows
sCustomer.select('Id').subtract(
sOrders[sOrders.employeeid==4].select('customerid')
).show()
+-----+
|   Id|
+-----+
|FRANR|
|SEVES|
|PARIS|
|VINET|
|SPECD|
|PERIC|
|PRINI|
|GROSR|
|CONSH|
|LAZYK|
|FISSA|
|THEBI|
|LAUGB|
|SANTG|
|NORTS|
|DUMON|
+-----+
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken: 0 hr 0 min 31 secs