import time
time_start_notebook = time.time()


import numpy as np
import pandas as pd

import functools # reduce

import sqlite3
import findspark, pyspark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Author: Bhishan Poudel

Python implementation: CPython
Python version       : 3.7.7
IPython version      : 7.16.1

Compiler    : Clang 4.0.1 (tags/RELEASE_401/final)
OS          : Darwin
Release     : 19.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit


numpy    : 1.17.5
autopep8 : 1.5.4
sqlite3  : 2.6.0
findspark: 1.4.2
pandas   : 1.0.5
json     : 2.0.9
pyspark  : 3.1.2


# pyspark settings
import os
import findspark

HOME = os.path.expanduser('~')
findspark.init(HOME + "/Softwares/Spark/spark-3.1.2-bin-hadoop3.2")

# Set spark environments
os.environ['PYSPARK_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'


import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col as _col
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.window import Window
from pyspark.sql.types import (StructField,
                               StringType, IntegerType,
                               FloatType,DoubleType,DateType,
                               StructType)
from pyspark.sql.functions import (mean as _mean, min as _min,
                                   max as _max, avg as _avg,
                                   when as _when
                                  )


spark = SparkSession\
        .builder\
        .appName("bhishan")\
        .getOrCreate()

sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")
print(pyspark.__version__)

3.1.2


# load the jupyter magic from ipython-sql
%load_ext sql
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False  # no Done, rows affected
%config SqlMagic.autopandas=True


%sql sqlite:////Volumes/Media/github/Tutorials_and_Lessons/SQL/c02_Book_sql_practice_problems_57/data/Northwind.sqlite


%%sql

SELECT *
FROM sqlite_master
WHERE TYPE='table';


Employee              =  %sql SELECT * from Employee;
Category              =  %sql SELECT * from Category;
Customer              =  %sql SELECT * from Customer;
Shipper               =  %sql SELECT * from Shipper;
Supplier              =  %sql SELECT * from Supplier;
# Order               =  %sql SELECT * from Order;
Orders                 =  %sql SELECT * from Orders;
Product               =  %sql SELECT * from Product;
OrderDetail           =  %sql SELECT * from OrderDetail;
CustomerCustomerDemo  =  %sql SELECT * from CustomerCustomerDemo;
CustomerDemographic   =  %sql SELECT * from CustomerDemographic;
Region                =  %sql SELECT * from Region;
Territory             =  %sql SELECT * from Territory;
EmployeeTerritory     =  %sql SELECT * from EmployeeTerritory;


Orders['orderdate'] = pd.to_datetime(Orders['orderdate'])
Orders['requireddate'] = pd.to_datetime(Orders['requireddate'])
Orders['shippeddate'] = pd.to_datetime(Orders['shippeddate'])

Orders.dtypes

orderid                    int64
customerid                object
employeeid                 int64
orderdate         datetime64[ns]
requireddate      datetime64[ns]
shippeddate       datetime64[ns]
shipvia                    int64
freight                  float64
shipname                  object
shipaddress               object
shipcity                  object
shipregion                object
shippostalcode            object
shipcountry               object
dtype: object


dfs_names = [ 'Employee', 'Category','Customer',
             'Shipper','Supplier','Orders',
             'Product','OrderDetail',
             'CustomerCustomerDemo','CustomerDemographic',
             'Region', 'Territory', 'EmployeeTerritory']

dfs = [ Employee, Category,Customer,Shipper,Supplier,
        Orders, Product,OrderDetail,
       CustomerCustomerDemo,CustomerDemographic,
       Region, Territory, EmployeeTerritory]

for df_name, df in zip(dfs_names,dfs):
    print(df_name, df.shape)
    display(df.head(2))

Employee (9, 18)

Category (8, 3)

Customer (91, 11)

Shipper (3, 3)

Supplier (29, 12)

Orders (830, 14)

Product (77, 10)

OrderDetail (2155, 6)

CustomerCustomerDemo (0, 0)

CustomerDemographic (0, 0)


def spark_df_from_pandas(pandas_df):
    df_dtype = pandas_df.dtypes.astype(str).reset_index()
    df_dtype.columns = ['column','dtype']

    mapping = {'int64'  : 'IntegerType()',
               'float64': 'DoubleType()',
               'bool'   : 'BooleanType()',
               'object' : 'StringType()',
               'datetime64[ns]': 'DateType()',
              }

    df_dtype['dtype'] = df_dtype['dtype'].map(mapping)
    df_dtype['schema'] = "    StructField('" +\
                         df_dtype['column'] + "'," +\
                         df_dtype['dtype'] + ",True),"

    head = 'StructType([\n'
    body = '\n'.join(df_dtype['schema'])
    tail = '\n    ])'

    schema = head + body + tail
    spark_df = sqlContext.createDataFrame(pandas_df, eval(schema))
    return spark_df


sEmployee = spark_df_from_pandas(Employee)
sEmployee.createOrReplaceTempView("Employee")

sCategory = spark_df_from_pandas(Category)
sCategory.createOrReplaceTempView("Category")

sCustomer = spark_df_from_pandas(Customer)
sCustomer.createOrReplaceTempView("Customer")

sShipper = spark_df_from_pandas(Shipper)
sShipper.createOrReplaceTempView("Shipper")

sSupplier = spark_df_from_pandas(Supplier)
sSupplier.createOrReplaceTempView("Supplier")

sOrders = spark_df_from_pandas(Orders)
sOrders.createOrReplaceTempView("Orders")

sProduct = spark_df_from_pandas(Product)
sProduct.createOrReplaceTempView("Product")

sOrderDetail = spark_df_from_pandas(OrderDetail)
sOrderDetail.createOrReplaceTempView("OrderDetail")


try:
    sCustomerCustomerDemo = spark_df_from_pandas(CustomerCustomerDemo)
except:
    pass


# IndexError: list index out of range


try:
    sCustomerCustomerDemo.createOrReplaceTempView("CustomerCustomerDemo")
except:
    pass


try:
    sCustomerDemographic = spark_df_from_pandas(CustomerDemographic)
    sCustomerDemographic.createOrReplaceTempView("CustomerDemographic")
except:
    pass


sRegion = spark_df_from_pandas(Region)
sRegion.createOrReplaceTempView("Region")

sTerritory = spark_df_from_pandas(Territory)
sTerritory.createOrReplaceTempView("Territory")

sEmployeeTerritory = spark_df_from_pandas(EmployeeTerritory)
sEmployeeTerritory.createOrReplaceTempView("EmployeeTerritory")


dfs_names = [ 'Employee', 'Category','Customer',
             'Shipper','Supplier','Orders',
             'Product','OrderDetail',
             'CustomerCustomerDemo','CustomerDemographic',
             'Region', 'Territory', 'EmployeeTerritory']

dfs = [ Employee, Category,Customer,Shipper,Supplier,
        Orders, Product,OrderDetail,
       CustomerCustomerDemo,CustomerDemographic,
       Region, Territory, EmployeeTerritory]

for df_name, df in zip(dfs_names,dfs):
    print(df_name, df.shape)
    display(df.head(2))

Employee (9, 18)

Category (8, 3)

Customer (91, 11)

Shipper (3, 3)

Supplier (29, 12)

Orders (830, 14)

Product (77, 10)

OrderDetail (2155, 6)

CustomerCustomerDemo (0, 0)

CustomerDemographic (0, 0)


# pyspark settings
import os
import findspark

HOME = os.path.expanduser('~')
findspark.init(HOME + "/Softwares/Spark/spark-3.1.2-bin-hadoop3.2")

# Set spark environments
os.environ['PYSPARK_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'


import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col as _col
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.window import Window
from pyspark.sql.types import (StructField,
                               StringType, IntegerType,
                               FloatType,DoubleType,DateType,
                               StructType)
from pyspark.sql.functions import (mean as _mean, min as _min,
                                   max as _max, avg as _avg,
                                   when as _when
                                  )


spark = SparkSession\
        .builder\
        .appName("bhishan")\
        .getOrCreate()

sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")
print(pyspark.__version__)

3.1.2


def spark_df_from_pandas(pandas_df):
    df_dtype = pandas_df.dtypes.astype(str).reset_index()
    df_dtype.columns = ['column','dtype']

    mapping = {'int64'  : 'IntegerType()',
               'float64': 'DoubleType()',
               'bool'   : 'BooleanType()',
               'object' : 'StringType()',
               'datetime64[ns]': 'DateType()',
              }

    df_dtype['dtype'] = df_dtype['dtype'].map(mapping)
    df_dtype['schema'] = "    StructField('" +\
                         df_dtype['column'] + "'," +\
                         df_dtype['dtype'] + ",True),"

    head = 'StructType([\n'
    body = '\n'.join(df_dtype['schema'])
    tail = '\n    ])'

    schema = head + body + tail
    spark_df = sqlContext.createDataFrame(pandas_df, eval(schema))
    return spark_df


sEmployee = spark_df_from_pandas(Employee)
sEmployee.createOrReplaceTempView("Employee")

sCategory = spark_df_from_pandas(Category)
sCategory.createOrReplaceTempView("Category")

sCustomer = spark_df_from_pandas(Customer)
sCustomer.createOrReplaceTempView("Customer")

sShipper = spark_df_from_pandas(Shipper)
sShipper.createOrReplaceTempView("Shipper")

sSupplier = spark_df_from_pandas(Supplier)
sSupplier.createOrReplaceTempView("Supplier")

sOrders = spark_df_from_pandas(Orders)
sOrders.createOrReplaceTempView("Orders")

sProduct = spark_df_from_pandas(Product)
sProduct.createOrReplaceTempView("Product")

sOrderDetail = spark_df_from_pandas(OrderDetail)
sOrderDetail.createOrReplaceTempView("OrderDetail")


try:
    sCustomerCustomerDemo = spark_df_from_pandas(CustomerCustomerDemo)
except:
    pass


# IndexError: list index out of range


try:
    sCustomerCustomerDemo.createOrReplaceTempView("CustomerCustomerDemo")
except:
    pass


sRegion = spark_df_from_pandas(Region)
sRegion.createOrReplaceTempView("Region")

sTerritory = spark_df_from_pandas(Territory)
sTerritory.createOrReplaceTempView("Territory")

sEmployeeTerritory = spark_df_from_pandas(EmployeeTerritory)
sEmployeeTerritory.createOrReplaceTempView("EmployeeTerritory")


OrderDetail.head(2)


OrderDetail['Discount'].max() # discount is percentage.

0.25


OrderDetail['OrderAmount'] = (OrderDetail['UnitPrice']  
                               * OrderDetail['Quantity']
                               * (1 - OrderDetail['Discount'])
                              )

OrderDetail.head(2)


# find total order amount per orderid
OrderDetail['TotalOrderAmount'] = OrderDetail.groupby('OrderId')['OrderAmount'].transform('sum')
OrderDetail.head(2)


Orders.head(2)


Orders['orderdate'].agg([min,max])

min   1996-07-04
max   1998-05-06
Name: orderdate, dtype: datetime64[ns]


o1 = Orders.loc[Orders['orderdate'].dt.year==1997,['orderid','customerid']]
print(o1.shape)

o1.head(2)

(408, 2)


# Look at yugabyte website ER diagram to find columns relation.
# required columns from book
# CustomerID CompanyName OrderID TotalOrderAmount
# Orders     Customers    Orders  CreatedFromOrderDetails

# note: shippers has company name but orders does not have shipper_id.


Customer.head(2)


c1 = Customer[['Id','CompanyName']]
c1.head(2)


od1 = OrderDetail[['OrderId','TotalOrderAmount']]
od1 = od1[od1['TotalOrderAmount'] >= 10_000]
od1.head(2)


o1.head(2)


tmp = od1.merge(o1,left_on='OrderId',right_on='orderid',suffixes=['_od','_o'])
tmp = tmp.rename(columns={'Id':'Id_od'})
tmp.head(2)


# OrderDetail['OrderAmount'] = (  OrderDetail['UnitPrice']
#                                - OrderDetail['Discount']
#                               ) * OrderDetail['Quantity'] 

# OrderDetail['TotalOrderAmount'] = OrderDetail.groupby('OrderId')\
#                                    ['OrderAmount'].transform('sum')

tmp = tmp.merge(c1,left_on='customerid',right_on='Id')
tmp = tmp.drop(['orderid','Id'],axis=1)
tmp = tmp[['customerid','CompanyName','OrderId','TotalOrderAmount']]
tmp = tmp.drop_duplicates()
tmp = tmp.sort_values('TotalOrderAmount',ascending=False)
tmp


o1 = Orders.loc[Orders['orderdate'].dt.year==1997,['orderid','customerid']]
c1 = Customer[['Id','CompanyName']]
od1 = OrderDetail[['OrderId','TotalOrderAmount']]
od1 = od1[od1['TotalOrderAmount'] >= 10_000]

tmp = od1.merge(o1,left_on='OrderId',right_on='orderid',suffixes=['_od','_o'])
tmp = tmp.rename(columns={'Id':'Id_od'})

tmp = tmp.merge(c1,left_on='customerid',right_on='Id')
tmp = tmp.drop(['orderid','Id'],axis=1)
tmp = tmp[['customerid','CompanyName','OrderId','TotalOrderAmount']]
tmp = tmp.drop_duplicates()
tmp = tmp.sort_values('TotalOrderAmount',ascending=False)

del OrderDetail['OrderAmount']
del OrderDetail['TotalOrderAmount']

tmp


OrderDetail.head(1)


%sql SELECT * from Customer limit 1;


%sql SELECT * from Orders limit 1;


%sql SELECT * from OrderDetail limit 1;


%%sql

SELECT c.Id AS CustomerID,
       c.CompanyName,
       o.OrderID,
       sum(od.UnitPrice * od.Quantity * (1 - od.Discount)) AS amount
FROM Customer AS c
INNER JOIN Orders AS o ON c.Id = o.customerid
INNER JOIN OrderDetail AS od ON o.orderid = od.OrderId -- WHERE substr(o.orderdate,1,4) = '1997'

WHERE o.orderdate >= date('1997-01-01')
  AND o.orderdate < date('1998-01-01')
GROUP BY c.Id,
         c.CompanyName,
         o.orderid
HAVING amount >= 10000
ORDER BY amount DESC;


"""

# WHERE YEAR(cast(o.orderdate as date)) = 1997 -- postgres has function YEAR.
# WHERE strftime('%y', o.orderdate) = '1997' -- this gives empty result.

# WHERE o.orderdate >= '19970101' and o.orderdate < '19980101' -- gives empty result.

""";


%sql pragma table_info('Orders');

tmp


Orders.head(1)


OrderDetail.head(1)


Customer.head(1)


df = Orders.merge(OrderDetail,
                  left_on='orderid',right_on='OrderId',
                  suffixes=['_o','_od'])\
           .merge(Customer,
                 left_on='customerid',right_on='Id',
                 suffixes=['_o_od','_c'])\
           .loc[lambda dfx: dfx['orderdate'].dt.year==1997]\
           .assign(OrderAmount = 
                   lambda dfx: (  dfx['UnitPrice'] 
                                * dfx['Quantity']
                                *(1- dfx['Discount'])
                               ))\
           .assign(CustomerSum = 
                  lambda dfx: dfx.groupby(['customerid','CompanyName'])
                                  ['OrderAmount'].transform('sum') 
                  )\
           .loc[lambda dfx: dfx['CustomerSum']>15000]\
           .drop_duplicates(['customerid','CompanyName'])\
           .sort_values('CustomerSum',ascending=False)\
           .reset_index(drop=True)\
           [['customerid','CompanyName','CustomerSum']]


print(df.shape)
df

(7, 3)


%%sql

SELECT c.Id AS CustomerID,
       c.CompanyName,
       sum(od.UnitPrice * od.Quantity * (1-od.Discount)) AS CustomerSum
FROM Customer AS c
INNER JOIN Orders AS o ON c.Id = o.customerid
INNER JOIN OrderDetail AS od ON o.orderid = od.OrderId
WHERE substr(o.orderdate, 1, 4) = '1997'
GROUP BY c.Id,
         c.CompanyName
HAVING CustomerSum >= 15000
ORDER BY CustomerSum DESC;


# I already included discount.


# we need EmployeeID OrderID OrderDate
# Only Orders table is needed.


Orders.head(1)


Orders.groupby([Orders.orderdate.dt.year,Orders.orderdate.dt.month],as_index=False).orderdate.transform('max').head()


Orders.groupby([Orders.orderdate.dt.year,Orders.orderdate.dt.month],as_index=False).orderdate.transform('max').tail()


(Orders[Orders.orderdate.dt.day
        == Orders.orderdate.dt.days_in_month]
  .sort_values(['employeeid','orderid'])
  [['employeeid','orderid','orderdate']]
  .reset_index(drop=True)
)


for_postgres = """
-- works for postgres
SELECT EmployeeID, OrderID, OrderDate
FROM Orders
WHERE DAY(DATE_ADD(OrderDate, INTERVAL 1 DAY)) = 1
ORDER BY EmployeeID, OrderID;
"""


"""
%%sql

-- (sqlite3.OperationalError) near "OrderID": syntax error
-- only sqlite
SELECT EmployeeID,
       OrderID,
       OrderDate
FROM Orders
WHERE orderdate =
    (SELECT date(orderdate, 'start of month', '+1 month', '-1 day'))
ORDER BY EmployeeID,
         OrderID;
""";


"""
-- no such function: last_day

SELECT EmployeeID,
       OrderID,
       OrderDate
FROM Orders
WHERE orderdate = last_day(orderdate)
ORDER BY EmployeeID,
         OrderID;
"""

'\n-- no such function: last_day\n\nSELECT EmployeeID,\n       OrderID,\n       OrderDate\nFROM Orders\nWHERE orderdate = last_day(orderdate)\nORDER BY EmployeeID,\n         OrderID;\n'


Orders.head(1)


OrderDetail.head(1)


OrderDetail.groupby('OrderId')['Id'].count().nlargest(10)

OrderId
11077    25
10657     6
10847     6
10979     6
10273     5
10294     5
10309     5
10324     5
10325     5
10337     5
Name: Id, dtype: int64


%sql select * from OrderDetail limit 1;


%%sql

SELECT OrderId,
       count(*) AS TotalOrderDetail
FROM OrderDetail
GROUP BY orderid
ORDER BY TotalOrderDetail DESC
LIMIT 10;


# spark.sql(q).show()


sOrderDetail.groupby('OrderId').agg({'Id':'count'}).orderBy('count(Id)',ascending=False).limit(10).show()

+-------+---------+
|OrderId|count(Id)|
+-------+---------+
|  11077|       25|
|  10979|        6|
|  10847|        6|
|  10657|        6|
|  10514|        5|
|  10294|        5|
|  10962|        5|
|  10558|        5|
|  10845|        5|
|  10458|        5|
+-------+---------+


Orders.shape

(830, 14)


Orders.sample(frac=0.02,random_state=100).shape

(17, 14)


%%sql

SELECT *
FROM Orders
ORDER BY RANDOM()
LIMIT 2;


%%sql

SELECT cast(0.2 * count(*) AS int)
FROM Orders;


%%sql

SELECT *
FROM
  (SELECT *
   FROM Orders
   ORDER BY RANDOM()
   LIMIT
     (SELECT cast(0.2 * count(*) AS int)
      FROM Orders))
LIMIT 2;


# ms sql server
q_ms_sql = """
Select top 2 percent
OrderID
From Orders
Order By NewID()
""";


sOrders.sample(fraction=0.2).show(2)

+-------+----------+----------+----------+------------+-----------+-------+----------+--------------+---------------+--------------+----------+--------------+-----------+
|orderid|customerid|employeeid| orderdate|requireddate|shippeddate|shipvia|   freight|      shipname|    shipaddress|      shipcity|shipregion|shippostalcode|shipcountry|
+-------+----------+----------+----------+------------+-----------+-------+----------+--------------+---------------+--------------+----------+--------------+-----------+
|  10250|     HANAR|         4|1996-07-08|  1996-08-05| 1996-07-12|      2|65.8300018| Hanari Carnes|Rua do Paço, 67|Rio de Janeiro|        RJ|     05454-876|     Brazil|
|  10266|     WARTH|         3|1996-07-26|  1996-09-06| 1996-07-31|      3|25.7299995|Wartian Herkku|    Torikatu 38|          Oulu|      null|         90110|    Finland|
+-------+----------+----------+----------+------------+-----------+-------+----------+--------------+---------------+--------------+----------+--------------+-----------+
only showing top 2 rows


# spark.sql(q) # function RANDOM() is not in pyspark sql


# %%sql

# SELECT orderid,
#        customerid
# FROM Orders tablesample(2 percent);


OrderDetail.head(2)


idx = OrderDetail[OrderDetail.Quantity >= 60]\
        .groupby(['OrderId','Quantity'])['Id'].transform('count')\
        .loc[lambda x: x>1].index

# OrderDetail.loc[idx]


f = OrderDetail[OrderDetail.Quantity >= 60]\
    .groupby(['OrderId','Quantity'])['Id']\
    .filter(lambda dfx: len(dfx)>1)
       
# OrderDetail.loc[f.index]


OrderDetail[OrderDetail.Quantity >= 60]\
    .groupby(['OrderId','Quantity'])['Id']\
    .filter(lambda dfx: len(dfx)>1)\
    .pipe(lambda x: OrderDetail.loc[x.index])\
    .drop_duplicates(['OrderId','Quantity'])


# sql


%%sql

SELECT *
FROM OrderDetail
WHERE Quantity >= 60
GROUP BY orderid,
         quantity
HAVING count(*) > 1
ORDER BY orderid;


spark.sql('select * from OrderDetail limit 1').show()

+--------+-------+---------+---------+--------+--------+
|      Id|OrderId|ProductId|UnitPrice|Quantity|Discount|
+--------+-------+---------+---------+--------+--------+
|10248/11|  10248|       11|     14.0|      12|     0.0|
+--------+-------+---------+---------+--------+--------+


%%sql

SELECT OrderId,
       Quantity -- CAVEAT: select * does not work.
FROM OrderDetail
WHERE Quantity >= 60
GROUP BY OrderId,
         Quantity
HAVING count(*) > 1
ORDER BY OrderId;


OrderDetail.head(2)
# we want all orders from orderid such that this orderid matches above criteria.


dfx = OrderDetail[OrderDetail.Quantity >= 60]\
    .groupby(['OrderId','Quantity'])['Id']\
    .filter(lambda dfx: len(dfx)>1)


dfx.index

Int64Index([43, 45, 1080, 1082, 1916, 1918, 2017, 2020], dtype='int64')


OrderDetail.loc[f.index]['OrderId']

43      10263
45      10263
1080    10658
1082    10658
1916    10990
1918    10990
2017    11030
2020    11030
Name: OrderId, dtype: int64


OrderDetail[OrderDetail.Quantity >= 60]\
    .groupby(['OrderId','Quantity'])['Id']\
    .filter(lambda dfx: len(dfx)>1)\
    .pipe(lambda dfx: OrderDetail[
                        OrderDetail.OrderId.isin(
                          OrderDetail.loc[dfx.index]['OrderId'])
                                  ])\
    .reset_index(drop=True)


%%sql
-- using CTE and subquery

SELECT *
FROM OrderDetail
WHERE OrderID IN
    (SELECT DISTINCT OrderID
     FROM OrderDetail
     WHERE Quantity >= 60
     GROUP BY OrderID,
              Quantity
     HAVING count(*) > 1
     ORDER BY OrderID)
ORDER BY Quantity;


# spark.sql(q).show()


%sql select * from OrderDetail limit 1;


%%sql

SELECT od.OrderID,
       ProductID,
       UnitPrice,
       Quantity,
       Discount
FROM OrderDetail AS od
JOIN
  (SELECT OrderID
   FROM OrderDetail
   WHERE Quantity >= 60
   GROUP BY OrderID,
            Quantity
   HAVING Count(*) > 1) sub ON sub.OrderID = od.OrderID
ORDER BY od.OrderID,
         od.ProductID;


# spark.sql(q).show()


Orders.head(1)


Orders[Orders.shippeddate > Orders.requireddate].head(2)


# pd.read_sql('pragma table_info("Orders")',conn)


# we can see the pragma of date related columns are already datetype
# we dont need to cast them as date.


%%sql

SELECT *
FROM Orders
WHERE shippeddate > requireddate
LIMIT 2;


# spark.sql(q).show()


Orders[Orders.shippeddate > Orders.requireddate].head(1)


Employee.head(1)


Orders[Orders.shippeddate > Orders.requireddate]\
.groupby('employeeid')['orderid'].count().rename('TotalLateOrders')\
.sort_values(ascending=False).reset_index()\
.merge(Employee,left_on='employeeid',right_on='Id',suffixes=['_late','_emp'])\
[['employeeid','TotalLateOrders','FirstName','LastName']]


%sql select * from Orders limit 1;


%sql select * from Employee limit 1;


%%sql

SELECT e.Id,
       e.LastName,
       count(*) AS TotalLateOrders
FROM Employee e
JOIN Orders o ON o.employeeid = e.id
WHERE o.shippeddate > o.requireddate
GROUP BY o.employeeid,
         e.LastName
ORDER BY count(*) DESC;


%%sql
-- # WARNING: cast as date gives WRONG result.

SELECT e.Id,
       e.LastName,
       count(*) AS TotalLateOrders
FROM Employee e
JOIN Orders o ON o.employeeid = e.id
WHERE cast(o.shippeddate AS date) > cast(o.requireddate AS date)
GROUP BY o.employeeid,
         e.LastName
ORDER BY count(*) DESC;


%%sql

SELECT shippeddate,
       cast(o.shippeddate AS date)
FROM Orders o
LIMIT 2;


Orders.head(1)


Orders[Orders.shippeddate > Orders.requireddate]\
  .groupby('employeeid')['orderid'].count()\
  .rename('LateOrders').reset_index()


Orders.employeeid.value_counts().rename('AllOrders')\
.rename_axis('employeeid').reset_index()


Orders[Orders.shippeddate >= Orders.requireddate]\
  .groupby('employeeid')['orderid'].count()\
  .rename('LateOrders').reset_index()\
  .merge(Orders.employeeid.value_counts().rename('AllOrders')
              .rename_axis('employeeid').reset_index(),
        on='employeeid',suffixes=['_ord','_emp']
)\
.merge(
Employee, left_on='employeeid', right_on='Id',suffixes=['_old','']
)[['employeeid','LastName','AllOrders','LateOrders']]

# note: use >= not >


Employee.head(1)


%sql select * from Employee limit 1;


%sql select * from Orders limit 1;


%%sql

SELECT e.Id,
       e.LastName,
       count(*) AS TotalOrders,
       late.TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
  (SELECT o.EmployeeId,
          e.LastName,
          count(*) AS TotalLateOrders
   FROM Employee AS e
   JOIN Orders AS o ON o.EmployeeID = e.ID
   WHERE o.shippeddate >= o.requireddate
   GROUP BY o.EmployeeId,
            e.LastName
   ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
         e.LastName;


# I am getting different result for Davolio with employee id 1


Orders.head(1)


Orders[Orders.employeeid==1].shape

(123, 14)


Orders[Orders.employeeid==1].head(2)


Employee.head(1)


Orders.query("employeeid==1 and shippeddate > requireddate")


# I can see employeeid 1 is Daviolio and She has TWO late shipping.
# If I use shippeddate >= requireddate, then our results match.


%%sql

SELECT e.Id,
       e.LastName,
       count(*) AS TotalOrders,
       late.TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
  (SELECT o.EmployeeId,
          e.LastName,
          count(*) AS TotalLateOrders
   FROM Employee AS e
   JOIN Orders AS o ON o.EmployeeID = e.ID
   WHERE o.shippeddate >= o.requireddate
   GROUP BY o.EmployeeId,
            e.LastName
   ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
         e.LastName,
         late.TotalLateOrders;


error = """
If I do not include late.TotalLateOrders in group by
I get following error:


expression 'late.`TotalLateOrders`'
is neither present in the group by,
nor is it an aggregate function.

Add to group by or wrap in first() (or first_value)
if you don't care which value you get.

"""


%sql select * from Employee limit 1;


%sql select * from Orders limit 1;


%%sql

SELECT e.Id,
       e.LastName,
       count(*) AS TotalOrders,
       CASE
           WHEN late.TotalLateOrders IS NULL THEN 0
           ELSE late.TotalLateOrders
       END AS TotalLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.EmployeeID = e.Id
LEFT JOIN
  (SELECT o.EmployeeID,
          e.LastName,
          count(*) AS TotalLateOrders
   FROM Employee AS e
   JOIN Orders AS o ON o.EmployeeID = e.Id
   WHERE o.ShippedDate >= o.RequiredDate
   GROUP BY o.EmployeeID,
            e.LastName
   ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
         e.LastName;


Orders[Orders.shippeddate >= Orders.requireddate]\
  .groupby('employeeid')['orderid'].count()\
  .rename('LateOrders').reset_index()\
  .merge(Orders.employeeid.value_counts().rename('AllOrders')
              .rename_axis('employeeid').reset_index(),
        on='employeeid',suffixes=['_ord','_emp']
)\
.merge(
Employee, left_on='employeeid', right_on='Id',suffixes=['_old','']
)[['employeeid','LastName','AllOrders','LateOrders']]\
.assign(
PercentLateOrders = lambda dfx: (dfx['LateOrders']/dfx['AllOrders']))


%%sql

SELECT e.Id,
       e.LastName,
       count(*) AS TotalOrders, -- late.TotalLateOrders,
 -- Handle the missing values
 CASE
     WHEN late.TotalLateOrders IS NULL THEN 0
     ELSE late.TotalLateOrders
 END AS TotalLateOrders, -- cast(late.TotalLateOrders as double) / cast(count(*) as double) as PercentLateOrders
 -- Handle missing values
 cast(CASE
          WHEN late.TotalLateOrders IS NULL THEN 0
          ELSE late.TotalLateOrders
      END AS DOUBLE) / cast(count(*) AS DOUBLE) AS PercentLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
  (SELECT o.EmployeeId,
          e.LastName,
          count(*) AS TotalLateOrders
   FROM Employee AS e
   JOIN Orders AS o ON o.EmployeeID = e.ID
   WHERE o.shippeddate >= o.requireddate
   GROUP BY o.EmployeeId,
            e.LastName
   ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
         e.LastName;


%%sql

SELECT e.Id,
       e.LastName,
       count(*) AS TotalOrders, -- late.TotalLateOrders,
 -- Handle the missing values
 CASE
     WHEN late.TotalLateOrders IS NULL THEN 0
     ELSE late.TotalLateOrders
 END AS TotalLateOrders, -- cast(late.TotalLateOrders as double) / cast(count(*) as double) as PercentLateOrders
 -- Handle missing values
 cast(CASE
          WHEN late.TotalLateOrders IS NULL THEN 0
          ELSE late.TotalLateOrders
      END AS DOUBLE) / cast(count(*) AS DOUBLE) AS PercentLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
  (SELECT o.EmployeeId,
          e.LastName,
          count(*) AS TotalLateOrders
   FROM Employee AS e
   JOIN Orders AS o ON o.EmployeeID = e.ID
   WHERE o.shippeddate >= o.requireddate
   GROUP BY o.EmployeeId,
            e.LastName
   ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
         e.LastName,
         TotalLateOrders;


%%sql

SELECT e.Id,
       e.LastName,
       count(*) AS TotalOrders, -- late.TotalLateOrders,
 -- Handle the missing values
 CASE
     WHEN late.TotalLateOrders IS NULL THEN 0
     ELSE late.TotalLateOrders
 END AS TotalLateOrders, 
 -- cast(late.TotalLateOrders as double) / cast(count(*) as double) as PercentLateOrders
 -- Handle missing values
 round(cast(CASE
                WHEN late.TotalLateOrders IS NULL THEN 0
                ELSE late.TotalLateOrders
            END AS DOUBLE) / cast(count(*) AS DOUBLE), 2) AS PercentLateOrders
FROM Employee AS e
JOIN Orders AS o ON o.employeeid = e.Id
JOIN
  (SELECT o.EmployeeId,
          e.LastName,
          count(*) AS TotalLateOrders
   FROM Employee AS e
   JOIN Orders AS o ON o.EmployeeID = e.ID
   WHERE o.shippeddate >= o.requireddate
   GROUP BY o.EmployeeId,
            e.LastName
   ORDER BY count(*) DESC) AS late ON e.Id = late.EmployeeID
GROUP BY e.Id,
         e.LastName,
         TotalLateOrders;


# OrderDetail Orders Employee
# Data only for 1997, at least one order
# make categories
# order by customerid

# CustomerID CompanyName TotalOrderAmount CustomerGroup
# Orders     Customers   OrderDetail**   CreateThis


OrderDetail.head(1)


Orders.head(1)


Customer.head(1)


( OrderDetail.drop(['Id','ProductId'],axis=1)
 
 # create new column
 .assign(OrderPrice = lambda x: x['UnitPrice'] * x['Quantity'] * (1-x['Discount']))
 
 # drop unwanted columns
 .drop(['UnitPrice', 'Quantity', 'Discount'],axis=1)
 
 # merge with Orders only in 1997
 .merge(
    Orders[['orderid','customerid','orderdate']
          ].loc[lambda x: x.orderdate.dt.year==1997],
     left_on='OrderId',right_on='orderid',suffixes=['_od','']
       )
 
 # drop unwanted columns
 .drop(['OrderId','orderid','orderdate'],axis=1)
 
 # merge with Customer
 .merge(
     Customer[['Id','CompanyName']],
     left_on='customerid',right_on='Id',suffixes=['_old','']
 
 )
 
 # drop duplicate column Id
 .drop(['Id'],axis=1)
 
 # find sum of orders for given customer
 .assign(
     TotalOrderAmount = lambda dfx: 
        dfx.groupby('customerid')['OrderPrice'].transform('sum')
        )
 
 # drop unwanted column
 .drop('OrderPrice',1)
 
 # assign cut labels
 .assign(
     CustomerGroup = lambda dfx: pd.cut(dfx['TotalOrderAmount'],
                     bins=[0,1000,5000, 10000, np.inf],
                     labels=['Low','Medium','High','VeryHigh']
                    )
 
 )
 
 # drop duplicates
 .drop_duplicates('customerid')
 
 # order by customerid
 .sort_values('customerid')

 # reset
 .reset_index(drop=True)

).pipe(lambda x: [print(x.shape), display(x.head())])

(86, 4)

[None, None]


%sql select * from Customer limit 1;


%%sql

SELECT *,
       CASE
           WHEN TotalOrder BETWEEN 0 AND 1000 THEN 'low'
           WHEN TotalOrder BETWEEN 1000 AND 5000 THEN 'medium'
           WHEN TotalOrder BETWEEN 5000 AND 10000 THEN 'high'
           WHEN TotalOrder >= 10000 THEN 'very high'
       END AS CustomerGroup
FROM
  (SELECT o.customerid,
          c.companyname,
          sum(od.unitprice * od.quantity) AS TotalOrder
   FROM Orders AS o
   JOIN OrderDetail AS od ON o.orderid = od.orderid
   JOIN Customer AS c ON o.customerid = c.id
   WHERE substr(o.orderdate, 1, 4) = '1997'
   GROUP BY o.customerid,
            c.companyname
   ORDER BY o.customerid,
            c.companyname)
LIMIT 2;


# spark.sql(q).show(3)


# in pandas, nulls are automatically ignored.


# the book says using BETWEEN works good for integers, but 
# for floats we should use >= and <


%%sql

SELECT *,
       CASE
           WHEN TotalOrder >=0
                AND TotalOrder<1000 THEN 'low'
           WHEN TotalOrder >=1000
                AND TotalOrder<5000 THEN 'medium'
           WHEN TotalOrder >= 5000
                AND TotalOrder<10000 THEN 'high'
           WHEN TotalOrder >= 10000 THEN 'very high'
       END AS CustomerGroup
FROM
  (SELECT o.customerid,
          c.companyname,
          sum(od.unitprice * od.quantity) AS TotalOrder
   FROM Orders AS o
   JOIN OrderDetail AS od ON o.orderid = od.orderid
   JOIN Customer AS c ON o.customerid = c.id
   WHERE substr(o.orderdate, 1, 4) = '1997'
   GROUP BY o.customerid,
            c.companyname
   ORDER BY o.customerid,
            c.companyname)
LIMIT 2;


( OrderDetail.drop(['Id','ProductId'],axis=1)
 
 # create new column
 .assign(OrderPrice = lambda x: x['UnitPrice'] * x['Quantity'] * (1-x['Discount']))
 
 # drop unwanted columns
 .drop(['UnitPrice', 'Quantity', 'Discount'],axis=1)
 
 # merge with Orders only in 1997
 .merge(
    Orders[['orderid','customerid','orderdate']
          ].loc[lambda x: x.orderdate.dt.year==1997],
     left_on='OrderId',right_on='orderid',suffixes=['_od','']
       )
 
 # drop unwanted columns
 .drop(['OrderId','orderid','orderdate'],axis=1)
 
 # merge with Customers
 .merge(
     Customer[['Id','CompanyName']],
     left_on='customerid',right_on='Id',suffixes=['_old','']
 
 )
 
 # drop duplicate column Id
 .drop(['Id'],axis=1)
 
 # find sum of orders for given customer
 .assign(
     TotalOrderAmount = lambda dfx: 
        dfx.groupby('customerid')['OrderPrice'].transform('sum')
        )
 
 # drop unwanted column
 .drop('OrderPrice',1)
 
 # assign cut labels
 .assign(
     CustomerGroup = lambda dfx: pd.cut(dfx['TotalOrderAmount'],
                     bins=[0,1000,5000, 10000, np.inf],
                     labels=['Low','Medium','High','VeryHigh']
                    )
 
 )
 
 # drop duplicates
 .drop_duplicates('customerid')
 
 # order by customerid
 .sort_values('customerid')

 # reset
 .reset_index(drop=True)
 
 # groupby customergroup
 .groupby('CustomerGroup').agg(
     TotalInGroup = ('TotalOrderAmount','size'),
     #TotalOrderGroup = ('TotalOrderAmount','sum'),
     
     # this does not work, since it does not give aggregated 
     # result. therefore, we need to create percentage later.
     # A = ('TotalOrderAmount', lambda dfx: dfx/dfx.sum())
     # for one group of dfx, we need one sum value.
 )
 
 # create percentage separately
 .assign(
 PercentageInGroup = lambda dfx: dfx['TotalInGroup']
                                /dfx['TotalInGroup'].sum())
 
 # sort
 .sort_values('TotalInGroup',ascending=False)

)


%%sql

SELECT o.customerid,
       coalesce(sum(od.unitprice * od.quantity), 0) AS total
FROM Orders o
INNER JOIN OrderDetail od ON o.orderid = od.orderid
WHERE substr(o.orderdate, 1, 4) = '1997'
GROUP BY o.customerid
LIMIT 2;


%%sql

SELECT *,
       CASE
           WHEN o7.total >=0
                AND o7.total <1000 THEN 'low'
           WHEN o7.total >=1000
                AND o7.total <5000 THEN 'medium'
           WHEN o7.total >= 5000
                AND o7.total <10000 THEN 'high'
           WHEN o7.total >= 10000 THEN 'very high'
       END AS mygroup
FROM
  (SELECT o.customerid,
          coalesce(sum(od.unitprice * od.quantity), 0) AS total
   FROM Orders o
   INNER JOIN OrderDetail od ON o.orderid = od.orderid
   WHERE substr(o.orderdate, 1, 4) = '1997'
   GROUP BY o.customerid) AS o7
LIMIT 2;


%%sql
SELECT  gc.mygroup, gc.group_total
        -- sum(gc.group_total) -- do this once to see the total
        ,(gc.group_total / 86.0 ) as percentage
        
        -- this gives only one row
        -- gc.group_total/ (select sum(gc.group_total))

        
FROM
(


    SELECT  cg.mygroup, count(cg.mygroup) as group_total
    FROM
    (

        select *,
           case 
           when o7.total >=0 and o7.total <1000  then 'low'
           when o7.total >=1000 and o7.total <5000  then 'medium'
           when o7.total >= 5000 and o7.total <10000 then 'high'
           when o7.total >= 10000 then 'very high'
           end as mygroup
        

        FROM
        (
            SELECT
            o.customerid,
            coalesce(sum(od.unitprice * od.quantity),0) as total
      
            FROM
                Orders o
                INNER JOIN
                OrderDetail od
                on o.orderid = od.orderid
                WHERE substr(o.orderdate,1,4) = '1997'
                GROUP BY
                o.customerid
        ) as o7
        
    ) as cg
    GROUP BY cg.mygroup
    
) as gc
ORDER BY gc.group_total DESC;


# I dont have extra table needed to solve this qn.


Supplier.head(1)


Customer.head(1)


np.union1d(Supplier['Country'],
           Customer['Country'])

array(['Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil', 'Canada',
       'Denmark', 'Finland', 'France', 'Germany', 'Ireland', 'Italy',
       'Japan', 'Mexico', 'Netherlands', 'Norway', 'Poland', 'Portugal',
       'Singapore', 'Spain', 'Sweden', 'Switzerland', 'UK', 'USA',
       'Venezuela'], dtype=object)


%%sql

SELECT country
FROM Supplier
UNION
SELECT country
FROM Customer;


Supplier.head(1)


Customer.head(1)


pd.concat([Supplier.Country.drop_duplicates(),
           Customer.Country.drop_duplicates()]
          ,ignore_index=True,axis=1)\
.rename(columns={0:'SuppliersCountry',1:'CustomersCountry'})\
.sort_values(['SuppliersCountry','CustomersCountry'])


(Supplier[['Country']]
    .drop_duplicates()
    .set_index('Country',drop=False)
    .sort_index()
    .rename(columns={'Country':'SuppliersCountry'})
    .rename_axis('id')
    .reset_index()
    
    ).head(2)


(pd.merge(
(Supplier[['Country']]
    .drop_duplicates()
    .set_index('Country',drop=False)
    .sort_index()
    .rename(columns={'Country':'SuppliersCountry'})
    .rename_axis('id')
    .reset_index()
    ),
    
(Customer[['Country']]
    .drop_duplicates()
    .set_index('Country',drop=False)
    .sort_index()
    .rename(columns={'Country':'CustomersCountry'})
    .rename_axis('id')
    .reset_index()
    ),

    how='outer')
.sort_values('id').drop('id', axis=1)
 
)


t1 = Supplier.Country.drop_duplicates()
t2 = Customer.Country.drop_duplicates()

tmp1 = pd.DataFrame({'id':t1, 'supplier_country':t1})
tmp2 = pd.DataFrame({'id':t2, 'customer_country':t2})

result = pd.merge(tmp1,tmp2, how='outer')
result = result.sort_values('id').drop('id', axis=1)
result


%%sql

SELECT sp.country AS suplier_country,
       cp.country AS customer_country
FROM
  (SELECT country
   FROM supplier
   UNION SELECT country
   FROM customer) AS c
LEFT JOIN
  (SELECT DISTINCT country
   FROM supplier) AS sp ON c.country = sp.country
LEFT JOIN
  (SELECT DISTINCT country
   FROM customer) AS cp ON c.country = cp.country;


# spark.sql(q).show() # spark does not respect order of columns


q_ms_sql = """
with countries AS (
  SELECT
    country
  FROM
    suppliers
  UNION
  SELECT
    country
  FROM
    customers
), suppliercountry AS (
  SELECT
    distinct country
  FROM
    suppliers
), customercountry AS (
  SELECT
    distinct country
  FROM
    customers
)
SELECT
  sp.country as suplier_country,
  cp.country as customer_country
FROM
  countries c
LEFT JOIN
  suppliercountry sp on c.country = sp.country
LEFT JOIN
  customercountry cp on c.country = cp.country


""";


q_book = """
;With SupplierCountries as
(Select Distinct Country from Suppliers)
,CustomerCountries as
(Select Distinct Country from Customers)
Select
SupplierCountry = SupplierCountries .Country
,CustomerCountry = CustomerCountries .Country
From SupplierCountries
Full Outer Join CustomerCountries
on CustomerCountries.Country = SupplierCountries.Country

""";


q_book_good = """


Select
SupplierCountry = SupplierCountries.Country
,CustomerCountry = CustomerCountries.Country

From 
(
Select Distinct Country from Suppliers
) as SupplierCountries



Full Outer Join
(
Select Distinct Country from Customers
) as CustomerCountries

on CustomerCountries.Country = SupplierCountries.Country

""";


Supplier.head(1)


Customer.head(1)


t1 = Supplier['Country'].value_counts().rename_axis('Country').rename('TotalSuppliers').reset_index()
t1.head(2)


t2 = Customer['Country'].value_counts().rename_axis('Country').rename('TotalCustomers').reset_index()

t2.head(2)


pd.merge(t1,t2,how='outer').sort_values('Country').fillna(0).set_index("Country").astype(int)


q_ms_sql = """

with countries AS (
  SELECT country FROM suppliers
  UNION
  SELECT country FROM customers
)
SELECT
  c.country,
  coalesce(count(sp.country),0) as totalsupliers,
  coalesce(count(cp.country),0) as totalcustomers
FROM
  countries c
LEFT JOIN
  supplier sp ON c.country = sp.country
LEFT JOIN
  customer cp ON c.country = cp.country
GROUP BY
  c.country
""";


%%sql

SELECT c.country,
       coalesce(count(sp.country), 0) AS totalsupliers,
       coalesce(count(cp.country), 0) AS totalcustomers
FROM
  (SELECT country
   FROM supplier
   UNION SELECT country
   FROM customer) AS c
LEFT JOIN supplier sp ON c.country = sp.country
LEFT JOIN customer cp ON c.country = cp.country
GROUP BY c.country;


q_book = """
;With SupplierCountries as
(Select Country , Total = Count(*) from Suppliers group by Country)
,CustomerCountries as
(Select Country , Total = Count(*) from Customers group by Country)
Select
Country = isnull( SupplierCountries.Country,
CustomerCountries.Country)
,TotalSuppliers= isnull(SupplierCountries.Total,0)
,TotalCustomers= isnull(CustomerCountries.Total,0)
From SupplierCountries
Full Outer Join CustomerCountries
on CustomerCountries.Country = SupplierCountries.Country

""";


q_book_good = """

Select
ifnull(SupplierCountries.Country,
CustomerCountries.Country) as Country

,ifnull(SupplierCountries.Total,0) as TotalSuppliers

,ifnull(CustomerCountries.Total,0) as TotalCustomers

From 
(
Select Country , Total = Count(*) from Suppliers group by Country
) as SupplierCountries


Full Outer Join
( Select Country , Total = Count(*) from Customers group by Country
) as CustomerCountries


on CustomerCountries.Country = SupplierCountries.Country

""";


# pd.read_sql(q_book_good,conn)
# OperationalError: RIGHT and FULL OUTER JOINs are not currently supported
# this might pass in actual sqlite3 but fails in python module.


Orders.head(1)


# for given country, orderdate must be the first.


Orders.groupby('shipcountry').agg(
first_orderdate=('orderdate','first')).head(2)


# for a given day, there can be multiple orders.


(Orders[['shipcountry','customerid', 'orderid', 'orderdate']]
 
.assign(
    first_orderdate = 
        lambda dfx: dfx.groupby('shipcountry'
                           )['orderdate'].transform('first')
)
 
# select only first date
.loc[lambda dfx: dfx.orderdate==dfx.first_orderdate]

# sort
.sort_values(['shipcountry','orderid'])

)


q_ms_sql = """
with ranked_orders AS (
  SELECT
    shipcountry,
    customerid,
    orderid,
    orderdate,
    rank() OVER (PARTITION BY shipcountry ORDER BY orderdate) as ranking
  FROM
    orders
)
SELECT
  shipcountry,
  customerid,
  orderid,
  orderdate
FROM
  ranked_orders
WHERE
  ranking = 1
ORDER BY
  shipcountry

""";


%%sql

SELECT shipcountry,
       customerid,
       orderid,
       orderdate
FROM
  (SELECT shipcountry,
          customerid,
          orderid,
          orderdate,
          rank() OVER (PARTITION BY shipcountry
                       ORDER BY orderdate) AS ranking
   FROM orders) AS ranked_orders
WHERE ranking = 1
ORDER BY shipcountry;


q_book = """
;with OrdersByCountry as
(
Select
ShipCountry
,CustomerID
,OrderID
,OrderDate = convert(date, OrderDate)
,RowNumberPerCountry =
Row_Number()
over (Partition by ShipCountry Order by ShipCountry, OrderID)
From Orders
)
Select
ShipCountry
,CustomerID
,OrderID
,OrderDate
From OrdersByCountry
Where
RowNumberPerCountry = 1
Order by
ShipCountry

""";


q_book_good = """

Select
ShipCountry
,CustomerID
,OrderID
,OrderDate

From 
(
Select
    ShipCountry
    ,CustomerID
    ,OrderID
    ,OrderDate
    ,(Row_Number() over (Partition by ShipCountry
                          Order by ShipCountry, OrderID)) 
    as RowNumberPerCountry

From Orders
) as OrdersByCountry

Where
RowNumberPerCountry = 1
Order by
ShipCountry

""";


Orders.head(1)


(Orders
 
# first sort 
.sort_values(['customerid', 'orderdate'])
 
# new column
.assign(
    daysbetween = lambda dfx: dfx['orderdate'].diff().dt.days
    ) 

# join to itself which is shifted by -1
# NOTE: Here old Orders do not have column daysbetween
.join(Orders.groupby('customerid').shift(-1), 
                      lsuffix='_initial',
                      rsuffix='_next')

# after joining drop unwanted cols
# .drop('daysbetween_initial', axis=1)
 
# filter
# .query('daysbetween_next <= 5 and daysbetween_next >=0')

).head(1)


(Orders
 
# first sort 
.sort_values(['customerid', 'orderdate'])
 
# new column
.assign(
    daysbetween = lambda dfx: dfx['orderdate'].diff().dt.days
    ) 

# join to itself which is shifted by -1
# NOTE: We need other dataframe which also has column daysbetween
.pipe(lambda dfx:
      dfx.join(dfx.groupby('customerid').shift(-1), 
                lsuffix='_initial',rsuffix='_next')
      )


# after joining drop unwanted cols
.drop('daysbetween_initial', axis=1)
 
# filter
# .query('daysbetween_next <= 5 and daysbetween_next >=0')
.query(' 0<= daysbetween_next <= 5 ')

).pipe(lambda x: [x.shape, display(x.head())])

[(69, 28), None]


# first sort 
Orders = Orders.sort_values(['customerid', 'orderdate'])
 
# new column
Orders['daysbetween'] = Orders['orderdate'].diff().dt.days

# join to itself which is shifted by -1
ans = Orders.join(
    Orders.groupby('customerid').shift(-1), 
    lsuffix='_initial',rsuffix='_next')
      

# after joining drop unwanted cols
ans = ans.drop('daysbetween_initial', axis=1)
 
# filter
ans.query('daysbetween_next <= 5 and daysbetween_next >=0').tail(2)


%%sql

SELECT initial_order.customerid,
       initial_order.orderid AS initial_order_id,
       initial_order.orderdate AS initial_order_date,
       next_order.orderid AS nextorderid,
       next_order.orderdate AS nextorderdate,
       CAST ((JulianDay(next_order.orderdate)
              - JulianDay(initial_order.orderdate)) AS int) AS daysbetween
FROM orders initial_order
INNER JOIN orders next_order ON initial_order.customerid = next_order.customerid
WHERE initial_order.orderid < next_order.orderid
  AND (JulianDay(next_order.orderdate) 
       - JulianDay(initial_order.orderdate)) <= 5
LIMIT 2;


from datetime import datetime
datetime(1996,10,1) - datetime(1996,9,26)

datetime.timedelta(days=5)


# customer ISLAT has bought at least twice within 5 days of
# 1996-09-26 to 1996-10-01


q_ms_sql = """
Select
InitialOrder.CustomerID
,InitialOrderID = InitialOrder.OrderID
,InitialOrderDate = convert(date, InitialOrder.OrderDate)
,NextOrderID = NextOrder.OrderID
,NextOrderDate = convert(date, NextOrder.OrderDate)
,DaysBetween = datediff(dd, InitialOrder.OrderDate,
NextOrder.OrderDate)
from Orders InitialOrder
join Orders NextOrder
on InitialOrder.CustomerID = NextOrder.CustomerID
where
InitialOrder.OrderID < NextOrder.OrderID
and datediff(dd, InitialOrder.OrderDate, NextOrder.OrderDate) <= 5
Order by
InitialOrder.CustomerID
,InitialOrder.OrderID

""";


%%sql

SELECT initial_order.customerid,
       initial_order.orderid AS initial_order_id,
       initial_order.orderdate AS initial_order_date,
       next_order.orderid AS nextorderid,
       next_order.orderdate AS nextorderdate,
       CAST ((JulianDay(next_order.orderdate) 
              - JulianDay(initial_order.orderdate)) AS int) AS daysbetween
FROM orders initial_order
INNER JOIN orders next_order ON initial_order.customerid = next_order.customerid
WHERE initial_order.orderid < next_order.orderid
  AND (JulianDay(next_order.orderdate) - JulianDay(initial_order.orderdate)) <= 5
LIMIT 2;


q_ms_sql = """
WITH customer_orders AS (
  SELECT
    customerid,
    orderid as initial_order_id,
    orderdate as initial_order_date,
    LEAD (o.orderdate, 1) OVER ( PARTITION BY customerid
    ORDER BY orderdate) AS next_order_date
  FROM
    orders o
  ORDER BY
    o.customerid, o.orderdate
)
SELECT
  *,
  (co.next_order_date - co.initial_order_date) as daysbetween
FROM
  customer_orders co
WHERE
  (co.next_order_date - co.initial_order_date) <= 5
""";


%%sql

SELECT *,
       cast((JulianDay(co.next_order_date) - JulianDay(co.initial_order_date))
            AS int) AS daysbetween
FROM
  (SELECT customerid,
          orderid AS initial_order_id,
          orderdate AS initial_order_date,
          LEAD (o.orderdate, 1) OVER (PARTITION BY customerid
                                      ORDER BY orderdate) AS next_order_date
   FROM orders o
   ORDER BY o.customerid,
            o.orderdate) AS co
WHERE (JulianDay(co.next_order_date) - JulianDay(co.initial_order_date)) <= 5
LIMIT 2;


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken: 0 hr 0 min 26 secs

	Id	LastName	FirstName	Title	TitleOfCourtesy	BirthDate	HireDate	Address	City	Region	PostalCode	Country	HomePhone	Extension	Photo	Notes	ReportsTo	PhotoPath
0	1	Davolio	Nancy	Sales Representative	Ms.	1948-12-08	1992-05-01	507 - 20th Ave. E. Apt. 2A	Seattle	WA	98122	USA	(206) 555-9857	5467	None	Education includes a BA in psychology from Col...	2.0	http://accweb/emmployees/davolio.bmp
1	2	Fuller	Andrew	Vice President, Sales	Dr.	1952-02-19	1992-08-14	908 W. Capital Way	Tacoma	WA	98401	USA	(206) 555-9482	3457	None	Andrew received his BTS commercial in 1974 and...	NaN	http://accweb/emmployees/fuller.bmp

	Id	CategoryName	Description
0	1	Beverages	Soft drinks, coffees, teas, beers, and ales
1	2	Condiments	Sweet and savory sauces, relishes, spreads, an...

	Id	CompanyName	ContactName	ContactTitle	Address	City	Region	PostalCode	Country	Phone	Fax
0	ALFKI	Alfreds Futterkiste	Maria Anders	Sales Representative	Obere Str. 57	Berlin	None	12209	Germany	030-0074321	030-0076545
1	ANATR	Ana Trujillo Emparedados y helados	Ana Trujillo	Owner	Avda. de la Constitución 2222	México D.F.	None	05021	Mexico	(5) 555-4729	(5) 555-3745

	Id	CompanyName	Phone
0	1	Speedy Express	(503) 555-9831
1	2	United Package	(503) 555-3199

	Id	CompanyName	ContactName	ContactTitle	Address	City	Region	PostalCode	Country	Phone	Fax	HomePage
0	1	Exotic Liquids	Charlotte Cooper	Purchasing Manager	49 Gilbert St.	London	None	EC1 4SD	UK	(171) 555-2222	None	None
1	2	New Orleans Cajun Delights	Shelley Burke	Order Administrator	P.O. Box 78934	New Orleans	LA	70117	USA	(100) 555-4822	None	#CAJUN.HTM#

Table of Contents

Description¶

Imports¶

Load the database¶

Get Pandas dataframes¶

Pyspark dataframes¶

Peek at dataframes¶

PySpark dataframes¶

Advanced SQL Questions¶

32. High-value customers¶

33. High-value customers - total orders¶

34. High-value customers - with discount¶

35. Month-end orders¶

36. Orders with many line items¶

37. Orders - random assortment¶

38. Orders - accidental double-entry¶

39. Orders - accidental double-entry details¶

40. Orders - accidental double-entry¶

41. Late orders¶

42. Late orders - which employees?¶

43. Late orders vs. total orders¶

45. Late orders vs. total orders - fix null¶

46. Late orders vs. total orders - percentage¶

47. Late orders vs. total orders - fix decimal¶

48. Customer grouping¶

49. Customer grouping - fix null¶

50. Customer grouping with percentage¶

51. Customer grouping - flexible¶

52. Countries with suppliers or customers¶

53. Countries with suppliers or customers, version 2¶

54. Countries with suppliers or customers - version 3¶

55. First order in each country¶

56. Customers with multiple orders in 5 day period¶

57. Customers with multiple orders in 5 day period, version 2¶

Time Taken¶

	type	name	tbl_name	rootpage	sql
0	table	Employee	Employee	2	CREATE TABLE "Employee" \n(\n "Id" INTEGER PR...
1	table	Category	Category	3	CREATE TABLE "Category" \n(\n "Id" INTEGER PR...
2	table	Customer	Customer	4	CREATE TABLE "Customer" \n(\n "Id" VARCHAR(80...
3	table	Shipper	Shipper	8	CREATE TABLE "Shipper" \n(\n "Id" INTEGER PRI...
4	table	Supplier	Supplier	9	CREATE TABLE "Supplier" \n(\n "Id" INTEGER PR...
5	table	Product	Product	12	CREATE TABLE "Product" \n(\n "Id" INTEGER PRI...
6	table	OrderDetail	OrderDetail	14	CREATE TABLE "OrderDetail" \n(\n "Id" VARCHAR...
7	table	CustomerCustomerDemo	CustomerCustomerDemo	16	CREATE TABLE "CustomerCustomerDemo" \n(\n "Id...
8	table	CustomerDemographic	CustomerDemographic	18	CREATE TABLE "CustomerDemographic" \n(\n "Id"...
9	table	Region	Region	21	CREATE TABLE "Region" \n(\n "Id" INTEGER PRIM...
10	table	Territory	Territory	22	CREATE TABLE "Territory" \n(\n "Id" VARCHAR(8...
11	table	EmployeeTerritory	EmployeeTerritory	24	CREATE TABLE "EmployeeTerritory" \n(\n "Id" V...
12	table	Order	Order	11	CREATE TABLE "Order" (\n"index" INTEGER,\n "O...
13	table	orders	orders	155	CREATE TABLE orders (\n orderid smallint NO...

	orderid	customerid	employeeid	orderdate	requireddate	shippeddate	shipvia	freight	shipname	shipaddress	shipcity	shipregion	shippostalcode	shipcountry
0	10248	VINET	5	1996-07-04	1996-08-01	1996-07-16	3	32.380001	Vins et alcools Chevalier	59 rue de l'Abbaye	Reims	None	51100	France
1	10249	TOMSP	6	1996-07-05	1996-08-16	1996-07-10	1	11.610000	Toms Spezialitäten	Luisenstr. 48	Münster	None	44087	Germany

	Id	ProductName	SupplierId	CategoryId	QuantityPerUnit	UnitPrice	UnitsInStock	UnitsOnOrder	ReorderLevel	Discontinued
0	1	Chai	1	1	10 boxes x 20 bags	18.0	39	0	10	0
1	2	Chang	1	1	24 - 12 oz bottles	19.0	17	40	25	0

	Id	OrderId	ProductId	UnitPrice	Quantity	Discount	OrderAmount	TotalOrderAmount
0	10248/11	10248	11	14.0	12	0.0	168.0	440.0
1	10248/42	10248	42	9.8	10	0.0	98.0	440.0

	customerid	CompanyName	OrderId	TotalOrderAmount
0	SIMOB	Simons bistro	10417	11188.4
4	RATTC	Rattlesnake Canyon Grocery	10479	10495.6
8	QUICK	QUICK-Stop	10540	10191.7
12	QUICK	QUICK-Stop	10691	10164.8

	cid	name	type	notnull	dflt_value
0	0	orderid	smallint	1	None
1	1	customerid	bpchar	0	None
2	2	employeeid	smallint	0	None
3	3	orderdate	date	0	None
4	4	requireddate	date	0	None
5	5	shippeddate	date	0	None
6	6	shipvia	smallint	0	None
7	7	freight	real	0	None
8	8	shipname	character varying(40)	0	None
9	9	shipaddress	character varying(60)	0	None
10	10	shipcity	character varying(15)	0	None
11	11	shipregion	character varying(15)	0	None
12	12	shippostalcode	character varying(10)	0	None
13	13	shipcountry	character varying(15)	0	None

	customerid	CompanyName	CustomerSum
0	QUICK	QUICK-Stop	61109.9100
1	SAVEA	Save-a-lot Markets	57713.5750
2	ERNSH	Ernst Handel	48096.2635
3	MEREP	Mère Paillarde	23332.3100
4	HUNGO	Hungry Owl All-Night Grocers	20454.4050
5	RATTC	Rattlesnake Canyon Grocery	19383.7500
6	SIMOB	Simons bistro	16232.4125

	orderdate
825	1998-05-06
826	1998-05-06
827	1998-05-06
828	1998-05-06
829	1998-05-06

	employeeid	orderid	orderdate
0	1	10461	1997-02-28
1	1	10616	1997-07-31
2	2	10583	1997-06-30
3	2	10686	1997-09-30
4	2	10989	1998-03-31
5	2	11060	1998-04-30
6	3	10432	1997-01-31
7	3	10806	1997-12-31
8	3	10988	1998-03-31
9	3	11063	1998-04-30
10	4	10343	1996-10-31
11	4	10522	1997-04-30
12	4	10584	1997-06-30
13	4	10617	1997-07-31
14	4	10725	1997-10-31
15	4	10807	1997-12-31
16	4	11061	1998-04-30
17	4	11062	1998-04-30
18	5	10269	1996-07-31
19	6	10317	1996-09-30
20	7	10490	1997-03-31
21	8	10399	1996-12-31
22	8	10460	1997-02-28
23	8	10491	1997-03-31
24	8	10987	1998-03-31
25	9	10687	1997-09-30

	OrderId	TotalOrderDetail
0	11077	25
1	10979	6
2	10847	6
3	10657	6
4	11064	5
5	11031	5
6	11021	5
7	10962	5
8	10893	5
9	10861	5

	Id	RegionDescription
0	1	Eastern
1	2	Western

	Id	TerritoryDescription	RegionId
0	01581	Westboro	1
1	01730	Bedford	1

	Id	EmployeeId	TerritoryId
0	1/06897	1	06897
1	1/19713	1	19713

	Id	RegionDescription
0	1	Eastern
1	2	Western

	orderid	customerid	employeeid	orderdate	requireddate	shippeddate	shipvia	freight	shipname	shipaddress	shipcity	shipregion	shippostalcode	shipcountry
0	10927	LACOR	4	1998-03-05	1998-04-02	1998-04-08	1	19.790001	La corne d'abondance	67, avenue de l'Europe	Versailles	None	78000	France
1	10293	TORTU	1	1996-08-29	1996-09-26	1996-09-11	3	21.180000	Tortuga Restaurante	Avda. Azteca 123	México D.F.	None	05033	Mexico

	orderid	customerid	employeeid	orderdate	requireddate	shippeddate	shipvia	freight	shipname	shipaddress	shipcity	shipregion	shippostalcode	shipcountry
0	10603	SAVEA	8	1997-07-18	1997-08-15	1997-08-08	2	48.770001	Save-a-lot Markets	187 Suffolk Ln.	Boise	ID	83720	USA
1	10757	SAVEA	6	1997-11-27	1997-12-25	1997-12-15	1	8.190000	Save-a-lot Markets	187 Suffolk Ln.	Boise	ID	83720	USA

	Id	OrderId	ProductId	UnitPrice	Quantity	Discount
43	10263/16	10263	16	13.9	60	0.25
1080	10658/40	10658	40	18.4	70	0.05
1916	10990/21	10990	21	10.0	65	0.00
2017	11030/2	11030	2	19.0	100	0.25

	Id	OrderId	ProductId	UnitPrice	Quantity	Discount
0	10263/16	10263	16	13.90	60	0.25
1	10263/24	10263	24	3.60	28	0.00
2	10263/30	10263	30	20.70	60	0.25
3	10263/74	10263	74	8.00	36	0.25
4	10658/21	10658	21	10.00	60	0.00
5	10658/40	10658	40	18.40	70	0.05
6	10658/60	10658	60	34.00	55	0.05
7	10658/77	10658	77	13.00	70	0.05
8	10990/21	10990	21	10.00	65	0.00
9	10990/34	10990	34	14.00	60	0.15
10	10990/55	10990	55	24.00	65	0.15
11	10990/61	10990	61	28.50	66	0.15
12	11030/2	11030	2	19.00	100	0.25
13	11030/5	11030	5	21.35	70	0.00
14	11030/29	11030	29	123.79	60	0.25
15	11030/59	11030	59	55.00	100	0.25

	orderid	customerid	employeeid	orderdate	requireddate	shippeddate	shipvia	freight	shipname	shipaddress	shipcity	shipregion	shippostalcode	shipcountry
16	10264	FOLKO	6	1996-07-24	1996-08-21	1996-08-23	3	3.67	Folk och fä HB	Åkergatan 24	Bräcke	None	S-844 67	Sweden
23	10271	SPLIR	6	1996-08-01	1996-08-29	1996-08-30	2	4.54	Split Rail Beer & Ale	P.O. Box 555	Lander	WY	82520	USA

	employeeid	TotalLateOrders	FirstName	LastName
0	4	10	Margaret	Peacock
1	3	5	Janet	Leverling
2	9	4	Anne	Dodsworth
3	8	4	Laura	Callahan
4	7	4	Robert	King
5	2	4	Andrew	Fuller
6	6	3	Michael	Suyama
7	1	2	Nancy	Davolio
8	5	1	Steven	Buchanan

	orderid	customerid	employeeid	orderdate	requireddate	shippeddate	shipvia	freight	shipname	shipaddress	shipcity	shipregion	shippostalcode	shipcountry
10	10258	ERNSH	1	1996-07-17	1996-08-14	1996-07-23	1	140.509995	Ernst Handel	Kirchgasse 6	Graz	None	8010	Austria
22	10270	WARTH	1	1996-08-01	1996-08-29	1996-08-02	1	136.539993	Wartian Herkku	Torikatu 38	Oulu	None	90110	Finland

	orderid	customerid	employeeid	orderdate	requireddate	shippeddate	shipvia	freight	shipname	shipaddress	shipcity	shipregion	shippostalcode	shipcountry
461	10709	GOURL	1	1997-10-17	1997-11-14	1997-11-20	3	210.800003	Gourmet Lanchonetes	Av. Brasil, 442	Campinas	SP	04876-786	Brazil
579	10827	BONAP	1	1998-01-12	1998-01-26	1998-02-06	2	63.540001	Bon app'	12, rue des Bouchers	Marseille	None	13008	France

	employeeid	LastName	AllOrders	LateOrders	PercentLateOrders
0	1	Davolio	123	3	0.024390
1	2	Fuller	96	4	0.041667
2	3	Leverling	127	5	0.039370
3	4	Peacock	156	10	0.064103
4	5	Buchanan	42	1	0.023810
5	6	Suyama	67	3	0.044776
6	7	King	72	4	0.055556
7	8	Callahan	104	5	0.048077
8	9	Dodsworth	43	5	0.116279

	Id	LastName	TotalOrders	TotalLateOrders	PercentLateOrders
0	1	Davolio	123	3	0.02
1	2	Fuller	96	4	0.04
2	3	Leverling	127	5	0.04
3	4	Peacock	156	10	0.06
4	5	Buchanan	42	1	0.02
5	6	Suyama	67	3	0.04
6	7	King	72	4	0.06
7	8	Callahan	104	5	0.05
8	9	Dodsworth	43	5	0.12

	customerid	CompanyName	TotalOrderAmount	CustomerGroup
0	ALFKI	Alfreds Futterkiste	2022.5000	Medium
1	ANATR	Ana Trujillo Emparedados y helados	799.7500	Low
2	ANTON	Antonio Moreno Taquería	5960.7775	High
3	AROUT	Around the Horn	6406.9000	High
4	BERGS	Berglunds snabbköp	13849.0150	VeryHigh

	customerid	companyname	TotalOrder	CustomerGroup
0	ALFKI	Alfreds Futterkiste	2294.00	medium
1	ANATR	Ana Trujillo Emparedados y helados	799.75	low

	TotalInGroup	PercentageInGroup
CustomerGroup
Medium	30	0.348837
High	23	0.267442
VeryHigh	17	0.197674
Low	16	0.186047

	mygroup	group_total	percentage
0	medium	29	0.337209
1	high	23	0.267442
2	very high	18	0.209302
3	low	16	0.186047

	Country
0	Argentina
1	Australia
2	Austria
3	Belgium
4	Brazil
5	Canada
6	Denmark
7	Finland
8	France
9	Germany
10	Ireland
11	Italy
12	Japan
13	Mexico
14	Netherlands
15	Norway
16	Poland
17	Portugal
18	Singapore
19	Spain
20	Sweden
21	Switzerland
22	UK
23	USA
24	Venezuela

	shipcountry	customerid	orderid	orderdate	first_orderdate
161	Argentina	OCEAN	10409	1997-01-09	1997-01-09
10	Austria	ERNSH	10258	1996-07-17	1996-07-17
4	Belgium	SUPRD	10252	1996-07-09	1996-07-09
2	Brazil	HANAR	10250	1996-07-08	1996-07-08
84	Canada	MEREP	10332	1996-10-17	1996-10-17
93	Denmark	SIMOB	10341	1996-10-29	1996-10-29
18	Finland	WARTH	10266	1996-07-26	1996-07-26
0	France	VINET	10248	1996-07-04	1996-07-04
1	Germany	TOMSP	10249	1996-07-05	1996-07-05
50	Ireland	HUNGO	10298	1996-09-05	1996-09-05
27	Italy	MAGAA	10275	1996-08-07	1996-08-07
11	Mexico	CENTC	10259	1996-07-18	1996-07-18
139	Norway	SANTG	10387	1996-12-18	1996-12-18
126	Poland	WOLZA	10374	1996-12-05	1996-12-05
80	Portugal	FURIB	10328	1996-10-14	1996-10-14
33	Spain	ROMEY	10281	1996-08-14	1996-08-14
16	Sweden	FOLKO	10264	1996-07-24	1996-07-24
6	Switzerland	CHOPS	10254	1996-07-11	1996-07-11
41	UK	BSBEV	10289	1996-08-26	1996-08-26
14	USA	RATTC	10262	1996-07-22	1996-07-22
9	Venezuela	HILAA	10257	1996-07-16	1996-07-16

	orderid_initial	customerid	employeeid_initial	orderdate_initial	requireddate_initial	shippeddate_initial	shipvia_initial	freight_initial	shipname_initial	shipaddress_initial	...	shippeddate_next	shipvia_next	freight_next	shipname_next	shipaddress_next	shipcity_next	shipregion_next	shippostalcode_next	shipcountry_next	daysbetween_next
429	10677	ANTON	1	1997-09-22	1997-10-20	1997-09-26	3	4.030000	Antonio Moreno Taquería	Mataderos 2312	...	1997-10-01	2.0	36.130001	Antonio Moreno Taquería	Mataderos 2312	México D.F.	None	05023	Mexico	3.0
493	10741	AROUT	4	1997-11-14	1997-11-28	1997-11-18	3	10.960000	Around the Horn	Brook Farm Stratford St. Mary	...	1997-11-21	2.0	23.719999	Around the Horn	Brook Farm Stratford St. Mary	Colchester	Essex	CO7 6JX	UK	3.0
30	10278	BERGS	8	1996-08-12	1996-09-09	1996-08-16	2	92.690002	Berglunds snabbköp	Berguvsvägen 8	...	1996-09-12	1.0	8.980000	Berglunds snabbköp	Berguvsvägen 8	Luleå	None	S-958 22	Sweden	2.0
196	10444	BERGS	3	1997-02-12	1997-03-12	1997-02-21	3	3.500000	Berglunds snabbköp	Berguvsvägen 8	...	1997-02-20	1.0	9.300000	Berglunds snabbköp	Berguvsvägen 8	Luleå	None	S-958 22	Sweden	1.0
618	10866	BERGS	5	1998-02-03	1998-03-03	1998-02-12	1	109.110001	Berglunds snabbköp	Berguvsvägen 8	...	1998-03-03	2.0	32.369999	Berglunds snabbköp	Berguvsvägen 8	Luleå	None	S-958 22	Sweden	3.0

	orderid_initial	customerid	employeeid_initial	orderdate_initial	requireddate_initial	shippeddate_initial	shipvia_initial	freight_initial	shipname_initial	shipaddress_initial	...	shippeddate_next	shipvia_next	freight_next	shipname_next	shipaddress_next	shipcity_next	shipregion_next	shippostalcode_next	shipcountry_next	daysbetween_next
445	10693	WHITC	3	1997-10-06	1997-10-20	1997-10-10	3	139.339996	White Clover Markets	1029 - 12th Ave. S.	...	1997-10-14	3.0	102.550003	White Clover Markets	1029 - 12th Ave. S.	Seattle	WA	98124	USA	2.0
625	10873	WILMK	4	1998-02-06	1998-03-06	1998-02-09	1	0.820000	Wilman Kala	Keskuskatu 45	...	1998-02-12	3.0	8.500000	Wilman Kala	Keskuskatu 45	Helsinki	None	21240	Finland	4.0