Spark3
to run sql commandsRelational Schema for Computer Store Database
import time
time_start_notebook = time.time()
# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/")
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp
import numpy as np
import pandas as pd
import os
import json
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Author: Bhishan Poudel Python implementation: CPython Python version : 3.7.7 IPython version : 7.16.1 Compiler : Clang 4.0.1 (tags/RELEASE_401/final) OS : Darwin Release : 19.6.0 Machine : x86_64 Processor : i386 CPU cores : 4 Architecture: 64bit autopep8: 1.5.4 json : 2.0.9 sys : 3.7.7 (default, May 6 2020, 04:59:01) [Clang 4.0.1 (tags/RELEASE_401/final)] pandas : 1.0.5 numpy : 1.17.5 bhishan : 0.4
!ls ~/.config/postgres
credentials.json
# postgres configs
import os,json
data = json.load(open(os.path.expanduser('~/.config/postgres/credentials.json')))
pw,port = data['password'], data['port'] # port is 5432
dbname = 'wbstore' # make sure pgadmin is running and given database is there.
%load_ext sql
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False # no Done, rows affected
%config SqlMagic.autopandas=True
%sql postgres://postgres:$pw@localhost:$port/$dbname
The sql extension is already loaded. To reload it, use: %reload_ext sql
We do in only once. Once we added these tables to our database dvdrental, we are done. We can now use these table now for all the future times.
%%sql
DROP TABLE IF EXISTS Manufacturers CASCADE;
%%sql
CREATE TABLE Manufacturers (Code INTEGER, Name VARCHAR(255) NOT NULL,
PRIMARY KEY (Code));
%%sql
INSERT INTO Manufacturers(Code, Name)
VALUES(1,'Sony');
INSERT INTO Manufacturers(Code, Name)
VALUES(2,'Creative Labs');
INSERT INTO Manufacturers(Code, Name)
VALUES(3,'Hewlett-Packard');
INSERT INTO Manufacturers(Code, Name)
VALUES(4,'Iomega');
INSERT INTO Manufacturers(Code, Name)
VALUES(5,'Fujitsu');
INSERT INTO Manufacturers(Code, Name)
VALUES(6,'Winchester');
INSERT INTO Manufacturers(Code, Name)
VALUES(7,'Bose');
%%sql
SELECT *
FROM Manufacturers;
code | name | |
---|---|---|
0 | 1 | Sony |
1 | 2 | Creative Labs |
2 | 3 | Hewlett-Packard |
3 | 4 | Iomega |
4 | 5 | Fujitsu |
5 | 6 | Winchester |
6 | 7 | Bose |
%%sql
DROP TABLE IF EXISTS Products CASCADE;
%%sql
CREATE TABLE Products (Code INTEGER, Name VARCHAR(255) NOT NULL,
Price DECIMAL NOT NULL,
Manufacturer INTEGER NOT NULL,
PRIMARY KEY (Code),
FOREIGN KEY (Manufacturer) REFERENCES Manufacturers(Code)) ;
%%sql
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(1, 'Hard drive', 240, 5);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(2, 'Memory', 120, 6);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(3, 'ZIP drive', 150, 4);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(4, 'Floppy disk', 5, 6);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(5, 'Monitor', 240, 1);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(6, 'DVD drive', 180, 2);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(7, 'CD drive', 90, 2);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(8, 'Printer', 270, 3);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(9, 'Toner cartridge', 66, 3);
INSERT INTO Products(Code, Name, Price, Manufacturer)
VALUES(10, 'DVD burner', 180, 2);
%%sql
SELECT *
FROM Manufacturers;
code | name | |
---|---|---|
0 | 1 | Sony |
1 | 2 | Creative Labs |
2 | 3 | Hewlett-Packard |
3 | 4 | Iomega |
4 | 5 | Fujitsu |
5 | 6 | Winchester |
6 | 7 | Bose |
dfm = pd.DataFrame({'code': [1, 2, 3, 4, 5, 6,7],
'name': ['Sony', 'Creative Labs', 'Hewlett-Packard', 'Iomega', 'Fujitsu', 'Winchester','Bose']})
dfm
code | name | |
---|---|---|
0 | 1 | Sony |
1 | 2 | Creative Labs |
2 | 3 | Hewlett-Packard |
3 | 4 | Iomega |
4 | 5 | Fujitsu |
5 | 6 | Winchester |
6 | 7 | Bose |
%%sql
SELECT *
FROM Products;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
x = %sql select * from Products;
dfp = x # x.DataFrame() we need x.DataFrame() if we dont have %config SqlMagic.autopandas=True
dfp.columns = dfp.columns.str.lower()
dfp.dtypes
code int64 name object price object manufacturer int64 dtype: object
dfp.infer_objects().dtypes
code int64 name object price object manufacturer int64 dtype: object
dfp['price'] = dfp['price'].astype(int)
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
df = dfp.merge(dfm,left_on='manufacturer', right_on='code',suffixes=['_product','_manufacturer'])
df
code_product | name_product | price | manufacturer | code_manufacturer | name_manufacturer | |
---|---|---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 | 5 | Fujitsu |
1 | 2 | Memory | 120 | 6 | 6 | Winchester |
2 | 4 | Floppy disk | 5 | 6 | 6 | Winchester |
3 | 3 | ZIP drive | 150 | 4 | 4 | Iomega |
4 | 5 | Monitor | 240 | 1 | 1 | Sony |
5 | 6 | DVD drive | 180 | 2 | 2 | Creative Labs |
6 | 7 | CD drive | 90 | 2 | 2 | Creative Labs |
7 | 10 | DVD burner | 180 | 2 | 2 | Creative Labs |
8 | 8 | Printer | 270 | 3 | 3 | Hewlett-Packard |
9 | 9 | Toner cartridge | 66 | 3 | 3 | Hewlett-Packard |
%%sql
SELECT Name
FROM Products;
name | |
---|---|
0 | Hard drive |
1 | Memory |
2 | ZIP drive |
3 | Floppy disk |
4 | Monitor |
5 | DVD drive |
6 | CD drive |
7 | Printer |
8 | Toner cartridge |
9 | DVD burner |
# dfp['name'] # series
dfp[['name']] # dataframe
name | |
---|---|
0 | Hard drive |
1 | Memory |
2 | ZIP drive |
3 | Floppy disk |
4 | Monitor |
5 | DVD drive |
6 | CD drive |
7 | Printer |
8 | Toner cartridge |
9 | DVD burner |
dfp.columns
Index(['code', 'name', 'price', 'manufacturer'], dtype='object')
%%sql
-- select name,price from Products;
SELECT Name,
Price
FROM Products;
name | price | |
---|---|---|
0 | Hard drive | 240 |
1 | Memory | 120 |
2 | ZIP drive | 150 |
3 | Floppy disk | 5 |
4 | Monitor | 240 |
5 | DVD drive | 180 |
6 | CD drive | 90 |
7 | Printer | 270 |
8 | Toner cartridge | 66 |
9 | DVD burner | 180 |
dfp[['name','price']]
name | price | |
---|---|---|
0 | Hard drive | 240 |
1 | Memory | 120 |
2 | ZIP drive | 150 |
3 | Floppy disk | 5 |
4 | Monitor | 240 |
5 | DVD drive | 180 |
6 | CD drive | 90 |
7 | Printer | 270 |
8 | Toner cartridge | 66 |
9 | DVD burner | 180 |
dfp.head(1)
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
dfp.loc[dfp.price <=200, 'name']
1 Memory 2 ZIP drive 3 Floppy disk 5 DVD drive 6 CD drive 8 Toner cartridge 9 DVD burner Name: name, dtype: object
dfp.query("price <= 200")#['name']
code | name | price | manufacturer | |
---|---|---|---|---|
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
%%sql
SELECT name
FROM Products
WHERE price <= 200;
name | |
---|---|
0 | Memory |
1 | ZIP drive |
2 | Floppy disk |
3 | DVD drive |
4 | CD drive |
5 | Toner cartridge |
6 | DVD burner |
dfp.query("60<= price <= 120")
code | name | price | manufacturer | |
---|---|---|---|---|
1 | 2 | Memory | 120 | 6 |
6 | 7 | CD drive | 90 | 2 |
8 | 9 | Toner cartridge | 66 | 3 |
dfp.loc[ (dfp.price>=60) & (dfp.price<=120)]
code | name | price | manufacturer | |
---|---|---|---|---|
1 | 2 | Memory | 120 | 6 |
6 | 7 | CD drive | 90 | 2 |
8 | 9 | Toner cartridge | 66 | 3 |
# dfp.loc[ 60<= dfp.price <=120] # syntax error.
dfp.loc[dfp.price.between(60,120)]
code | name | price | manufacturer | |
---|---|---|---|---|
1 | 2 | Memory | 120 | 6 |
6 | 7 | CD drive | 90 | 2 |
8 | 9 | Toner cartridge | 66 | 3 |
%%sql
SELECT *
FROM Products
WHERE price BETWEEN 60 AND 120;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 2 | Memory | 120 | 6 |
1 | 7 | CD drive | 90 | 2 |
2 | 9 | Toner cartridge | 66 | 3 |
%%sql
SELECT *
FROM Products
WHERE price >= 60
AND Price <= 120;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 2 | Memory | 120 | 6 |
1 | 7 | CD drive | 90 | 2 |
2 | 9 | Toner cartridge | 66 | 3 |
# dfp['price_cents'] = dfp['price'] * 100
# dfp[['name','price_cents']]
dfp.assign(
price_cents = dfp['price']*100
)[['name','price_cents']]
name | price_cents | |
---|---|---|
0 | Hard drive | 24000 |
1 | Memory | 12000 |
2 | ZIP drive | 15000 |
3 | Floppy disk | 500 |
4 | Monitor | 24000 |
5 | DVD drive | 18000 |
6 | CD drive | 9000 |
7 | Printer | 27000 |
8 | Toner cartridge | 6600 |
9 | DVD burner | 18000 |
%%sql
SELECT name,
price * 100 AS price_cents
FROM Products;
name | price_cents | |
---|---|---|
0 | Hard drive | 24000 |
1 | Memory | 12000 |
2 | ZIP drive | 15000 |
3 | Floppy disk | 500 |
4 | Monitor | 24000 |
5 | DVD drive | 18000 |
6 | CD drive | 9000 |
7 | Printer | 27000 |
8 | Toner cartridge | 6600 |
9 | DVD burner | 18000 |
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
dfp.shape[0], dfp['name'].nunique()
(10, 10)
dfp.groupby('name')['price'].mean()
name CD drive 90 DVD burner 180 DVD drive 180 Floppy disk 5 Hard drive 240 Memory 120 Monitor 240 Printer 270 Toner cartridge 66 ZIP drive 150 Name: price, dtype: int64
dfp['price'].mean()
154.1
%%sql
select avg(price) from Products;
avg | |
---|---|
0 | 154.1000000000000000 |
dfp.price[dfp.manufacturer==2].mean()
150.0
# Never do:
dfp[dfp.manufacturer==2]['price'].mean()
# never select whole dataframe, if you just need one series.
150.0
%%sql
SELECT avg(price)
FROM Products
WHERE manufacturer=2;
avg | |
---|---|
0 | 150.0000000000000000 |
dfp[dfp.price >= 180].shape[0]
5
dfp.price[dfp.price >= 180].shape[0]
5
%%sql
SELECT count(*)
FROM Products
WHERE price >= 180;
count | |
---|---|
0 | 5 |
dfp[['name','price']].query("price >= 180")\
.sort_values(['price','name'],ascending=[False,True])
name | price | |
---|---|---|
7 | Printer | 270 |
0 | Hard drive | 240 |
4 | Monitor | 240 |
9 | DVD burner | 180 |
5 | DVD drive | 180 |
%%sql
SELECT name,
price
FROM Products
WHERE price >= 180
ORDER BY price DESC,
name ASC;
name | price | |
---|---|---|
0 | Printer | 270 |
1 | Hard drive | 240 |
2 | Monitor | 240 |
3 | DVD burner | 180 |
4 | DVD drive | 180 |
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
dfm
code | name | |
---|---|---|
0 | 1 | Sony |
1 | 2 | Creative Labs |
2 | 3 | Hewlett-Packard |
3 | 4 | Iomega |
4 | 5 | Fujitsu |
5 | 6 | Winchester |
6 | 7 | Bose |
dfp.merge(dfm,left_on='manufacturer', right_on='code',how='left')
code_x | name_x | price | manufacturer | code_y | name_y | |
---|---|---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 | 5 | Fujitsu |
1 | 2 | Memory | 120 | 6 | 6 | Winchester |
2 | 3 | ZIP drive | 150 | 4 | 4 | Iomega |
3 | 4 | Floppy disk | 5 | 6 | 6 | Winchester |
4 | 5 | Monitor | 240 | 1 | 1 | Sony |
5 | 6 | DVD drive | 180 | 2 | 2 | Creative Labs |
6 | 7 | CD drive | 90 | 2 | 2 | Creative Labs |
7 | 8 | Printer | 270 | 3 | 3 | Hewlett-Packard |
8 | 9 | Toner cartridge | 66 | 3 | 3 | Hewlett-Packard |
9 | 10 | DVD burner | 180 | 2 | 2 | Creative Labs |
%%sql
SELECT *
FROM Products p,
Manufacturers m
WHERE p.Manufacturer = m.Code;
code | name | price | manufacturer | code | name | |
---|---|---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 | 5 | Fujitsu |
1 | 2 | Memory | 120 | 6 | 6 | Winchester |
2 | 3 | ZIP drive | 150 | 4 | 4 | Iomega |
3 | 4 | Floppy disk | 5 | 6 | 6 | Winchester |
4 | 5 | Monitor | 240 | 1 | 1 | Sony |
5 | 6 | DVD drive | 180 | 2 | 2 | Creative Labs |
6 | 7 | CD drive | 90 | 2 | 2 | Creative Labs |
7 | 8 | Printer | 270 | 3 | 3 | Hewlett-Packard |
8 | 9 | Toner cartridge | 66 | 3 | 3 | Hewlett-Packard |
9 | 10 | DVD burner | 180 | 2 | 2 | Creative Labs |
%%sql
SELECT *
FROM Products p
INNER JOIN Manufacturers m ON p.Manufacturer = m.Code;
code | name | price | manufacturer | code | name | |
---|---|---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 | 5 | Fujitsu |
1 | 2 | Memory | 120 | 6 | 6 | Winchester |
2 | 3 | ZIP drive | 150 | 4 | 4 | Iomega |
3 | 4 | Floppy disk | 5 | 6 | 6 | Winchester |
4 | 5 | Monitor | 240 | 1 | 1 | Sony |
5 | 6 | DVD drive | 180 | 2 | 2 | Creative Labs |
6 | 7 | CD drive | 90 | 2 | 2 | Creative Labs |
7 | 8 | Printer | 270 | 3 | 3 | Hewlett-Packard |
8 | 9 | Toner cartridge | 66 | 3 | 3 | Hewlett-Packard |
9 | 10 | DVD burner | 180 | 2 | 2 | Creative Labs |
%%sql
SELECT *
FROM Products p
LEFT JOIN Manufacturers m ON p.Manufacturer = m.Code;
code | name | price | manufacturer | code | name | |
---|---|---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 | 5 | Fujitsu |
1 | 2 | Memory | 120 | 6 | 6 | Winchester |
2 | 3 | ZIP drive | 150 | 4 | 4 | Iomega |
3 | 4 | Floppy disk | 5 | 6 | 6 | Winchester |
4 | 5 | Monitor | 240 | 1 | 1 | Sony |
5 | 6 | DVD drive | 180 | 2 | 2 | Creative Labs |
6 | 7 | CD drive | 90 | 2 | 2 | Creative Labs |
7 | 8 | Printer | 270 | 3 | 3 | Hewlett-Packard |
8 | 9 | Toner cartridge | 66 | 3 | 3 | Hewlett-Packard |
9 | 10 | DVD burner | 180 | 2 | 2 | Creative Labs |
dfp.head(2)
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
dfm.head(2)
code | name | |
---|---|---|
0 | 1 | Sony |
1 | 2 | Creative Labs |
dfp.merge(dfm,left_on='manufacturer',right_on='code',
suffixes=['_product','_manufacturer']
)[['name_product','price','name_manufacturer']]
name_product | price | name_manufacturer | |
---|---|---|---|
0 | Hard drive | 240 | Fujitsu |
1 | Memory | 120 | Winchester |
2 | Floppy disk | 5 | Winchester |
3 | ZIP drive | 150 | Iomega |
4 | Monitor | 240 | Sony |
5 | DVD drive | 180 | Creative Labs |
6 | CD drive | 90 | Creative Labs |
7 | DVD burner | 180 | Creative Labs |
8 | Printer | 270 | Hewlett-Packard |
9 | Toner cartridge | 66 | Hewlett-Packard |
# dfm.set_index('code')
# dfm.set_index('code')['name']
# dfp['manufacturer'].map(dfm.set_index('code')['name'])
dfp[['name','price']].assign(
name_manufacturer = dfp['manufacturer'].map(dfm.set_index('code')['name']))
name | price | name_manufacturer | |
---|---|---|---|
0 | Hard drive | 240 | Fujitsu |
1 | Memory | 120 | Winchester |
2 | ZIP drive | 150 | Iomega |
3 | Floppy disk | 5 | Winchester |
4 | Monitor | 240 | Sony |
5 | DVD drive | 180 | Creative Labs |
6 | CD drive | 90 | Creative Labs |
7 | Printer | 270 | Hewlett-Packard |
8 | Toner cartridge | 66 | Hewlett-Packard |
9 | DVD burner | 180 | Creative Labs |
# using two tables and where
%%sql
SELECT p.name,
p.price,
m.name AS manufacturer
FROM Products p,
Manufacturers m
WHERE p.manufacturer=m.Code;
name | price | manufacturer | |
---|---|---|---|
0 | Hard drive | 240 | Fujitsu |
1 | Memory | 120 | Winchester |
2 | ZIP drive | 150 | Iomega |
3 | Floppy disk | 5 | Winchester |
4 | Monitor | 240 | Sony |
5 | DVD drive | 180 | Creative Labs |
6 | CD drive | 90 | Creative Labs |
7 | Printer | 270 | Hewlett-Packard |
8 | Toner cartridge | 66 | Hewlett-Packard |
9 | DVD burner | 180 | Creative Labs |
# using inner join and ON
%%sql
SELECT p.name,
p.price,
m.name AS manufacturer
FROM Products p
INNER JOIN Manufacturers m ON p.manufacturer=m.code;
name | price | manufacturer | |
---|---|---|---|
0 | Hard drive | 240 | Fujitsu |
1 | Memory | 120 | Winchester |
2 | ZIP drive | 150 | Iomega |
3 | Floppy disk | 5 | Winchester |
4 | Monitor | 240 | Sony |
5 | DVD drive | 180 | Creative Labs |
6 | CD drive | 90 | Creative Labs |
7 | Printer | 270 | Hewlett-Packard |
8 | Toner cartridge | 66 | Hewlett-Packard |
9 | DVD burner | 180 | Creative Labs |
dfp.head(2)
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
dfm.head(2)
code | name | |
---|---|---|
0 | 1 | Sony |
1 | 2 | Creative Labs |
dfp.groupby('manufacturer')['price'].mean()
manufacturer 1 240.0 2 150.0 3 168.0 4 150.0 5 240.0 6 62.5 Name: price, dtype: float64
dfp['manufacturer'].unique()
array([5, 6, 4, 1, 2, 3])
%%sql
SELECT manufacturer,
round(avg(price), 2) AS avg_price
FROM Products
GROUP BY manufacturer;
manufacturer | avg_price | |
---|---|---|
0 | 3 | 168.00 |
1 | 5 | 240.00 |
2 | 4 | 150.00 |
3 | 6 | 62.50 |
4 | 2 | 150.00 |
5 | 1 | 240.00 |
%%sql
SELECT manufacturer,
round(avg(price), 2) AS avg_price
FROM Products
GROUP BY manufacturer
ORDER BY manufacturer;
manufacturer | avg_price | |
---|---|---|
0 | 1 | 240.00 |
1 | 2 | 150.00 |
2 | 3 | 168.00 |
3 | 4 | 150.00 |
4 | 5 | 240.00 |
5 | 6 | 62.50 |
dfp.groupby('manufacturer')['price'].mean()
manufacturer 1 240.0 2 150.0 3 168.0 4 150.0 5 240.0 6 62.5 Name: price, dtype: float64
dfp.merge(dfm,left_on='manufacturer', right_on='code',
suffixes=['_product','_manufacturer'])\
.groupby('name_manufacturer')['price'].mean()
name_manufacturer Creative Labs 150.0 Fujitsu 240.0 Hewlett-Packard 168.0 Iomega 150.0 Sony 240.0 Winchester 62.5 Name: price, dtype: float64
%%sql
SELECT m.name,
round(avg(price), 2) AS avg_price
FROM Products p
INNER JOIN Manufacturers m ON p.manufacturer = m.code
GROUP BY m.name;
name | avg_price | |
---|---|---|
0 | Sony | 240.00 |
1 | Iomega | 150.00 |
2 | Hewlett-Packard | 168.00 |
3 | Winchester | 62.50 |
4 | Creative Labs | 150.00 |
5 | Fujitsu | 240.00 |
%%sql
SELECT m.name,
round(avg(price), 2) AS avg_price
FROM Products p,
Manufacturers m
WHERE p.manufacturer = m.code
GROUP BY m.name;
name | avg_price | |
---|---|---|
0 | Sony | 240.00 |
1 | Iomega | 150.00 |
2 | Hewlett-Packard | 168.00 |
3 | Winchester | 62.50 |
4 | Creative Labs | 150.00 |
5 | Fujitsu | 240.00 |
dfp.head(2)
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
dfp.merge(dfm,left_on='manufacturer', right_on='code',
suffixes=['_product','_manufacturer'])\
.groupby('name_manufacturer')['price'].mean()\
.rename('avg_price').rename_axis('manufacturer')\
.loc[lambda x: x>=150]\
.reset_index()
manufacturer | avg_price | |
---|---|---|
0 | Creative Labs | 150.0 |
1 | Fujitsu | 240.0 |
2 | Hewlett-Packard | 168.0 |
3 | Iomega | 150.0 |
4 | Sony | 240.0 |
%%sql
SELECT m.name AS manufacturer,
round(avg(price), 2) AS avg_price
FROM Products p
INNER JOIN Manufacturers m ON p.manufacturer = m.code
GROUP BY m.name
HAVING avg(price) >= 150;
manufacturer | avg_price | |
---|---|---|
0 | Sony | 240.00 |
1 | Iomega | 150.00 |
2 | Hewlett-Packard | 168.00 |
3 | Creative Labs | 150.00 |
4 | Fujitsu | 240.00 |
dfp.head(2)
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
dfp.loc[dfp.price==dfp.price.min(), ['name','price']]
name | price | |
---|---|---|
3 | Floppy disk | 5 |
m = dfp.price.min()
dfp.query("price == @m")[['name','price']]
name | price | |
---|---|---|
3 | Floppy disk | 5 |
dfp.query("price == @dfp.price.min()")[['name','price']]
name | price | |
---|---|---|
3 | Floppy disk | 5 |
# %%sql
# select name, price
# from Products
# where price = min(price); # this does not work
%%sql
SELECT name,
price
FROM Products
WHERE price =
(SELECT min(price)
FROM Products);
name | price | |
---|---|---|
0 | Floppy disk | 5 |
%%sql
SELECT name,
price
FROM Products
ORDER BY price ASC
LIMIT 1;
name | price | |
---|---|---|
0 | Floppy disk | 5 |
dfp.nsmallest(1,'price')[['name','price']]
name | price | |
---|---|---|
3 | Floppy disk | 5 |
dfp.sort_values('price').head(1)[['name','price']]
name | price | |
---|---|---|
3 | Floppy disk | 5 |
dfp.sort_values('price').iloc[[0]][['name','price']]
name | price | |
---|---|---|
3 | Floppy disk | 5 |
dfp.merge(dfm,left_on='manufacturer', right_on='code',
suffixes=['_product','_manufacturer'])\
.nlargest(1,'price')[['name_manufacturer','price']]
name_manufacturer | price | |
---|---|---|
8 | Hewlett-Packard | 270 |
%%sql
SELECT m.name,
price
FROM Products p,
Manufacturers m
WHERE (p.manufacturer=m.code
AND price =
(SELECT max(price)
FROM Products));
name | price | |
---|---|---|
0 | Hewlett-Packard | 270 |
%%sql
SELECT m.name,
price
FROM Products p
INNER JOIN Manufacturers m ON p.manufacturer=m.code
ORDER BY price DESC
LIMIT 1;
name | price | |
---|---|---|
0 | Hewlett-Packard | 270 |
%%sql
SELECT p.name,
p.price,
m.name
FROM Products p,
Manufacturers m
WHERE p.manufacturer = m.code
AND p.price =
(SELECT max(p.price)
FROM Products p
WHERE p.Manufacturer = m.Code );
name | price | name | |
---|---|---|---|
0 | Hard drive | 240 | Fujitsu |
1 | Memory | 120 | Winchester |
2 | ZIP drive | 150 | Iomega |
3 | Monitor | 240 | Sony |
4 | DVD drive | 180 | Creative Labs |
5 | Printer | 270 | Hewlett-Packard |
6 | DVD burner | 180 | Creative Labs |
%%sql
-- here, subquery has P and M and it has only one max value.
SELECT P.name,
P.price,
M.name
FROM Products P,
Manufacturers M
WHERE P.manufacturer = M.code
AND P.price =
(SELECT max(P.price)
FROM Products P,
Manufacturers M
WHERE P.Manufacturer = M.Code );
name | price | name | |
---|---|---|---|
0 | Printer | 270 | Hewlett-Packard |
%%sql
SELECT P.Name,
P.Price,
M.Name
FROM Products P
INNER JOIN Manufacturers M ON P.Manufacturer = M.Code
AND P.Price =
(SELECT max(P.Price)
FROM Products P
WHERE P.Manufacturer = M.Code );
name | price | name | |
---|---|---|---|
0 | Hard drive | 240 | Fujitsu |
1 | Memory | 120 | Winchester |
2 | ZIP drive | 150 | Iomega |
3 | Monitor | 240 | Sony |
4 | DVD drive | 180 | Creative Labs |
5 | Printer | 270 | Hewlett-Packard |
6 | DVD burner | 180 | Creative Labs |
%%sql
SELECT max(P.Price)
FROM Products P,
Manufacturers M
WHERE P.Manufacturer = M.Code;
max | |
---|---|
0 | 270 |
df = dfp.merge(dfm,left_on='manufacturer', right_on='code',suffixes=['_product','_manufacturer'])
df
code_product | name_product | price | manufacturer | code_manufacturer | name_manufacturer | |
---|---|---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 | 5 | Fujitsu |
1 | 2 | Memory | 120 | 6 | 6 | Winchester |
2 | 4 | Floppy disk | 5 | 6 | 6 | Winchester |
3 | 3 | ZIP drive | 150 | 4 | 4 | Iomega |
4 | 5 | Monitor | 240 | 1 | 1 | Sony |
5 | 6 | DVD drive | 180 | 2 | 2 | Creative Labs |
6 | 7 | CD drive | 90 | 2 | 2 | Creative Labs |
7 | 10 | DVD burner | 180 | 2 | 2 | Creative Labs |
8 | 8 | Printer | 270 | 3 | 3 | Hewlett-Packard |
9 | 9 | Toner cartridge | 66 | 3 | 3 | Hewlett-Packard |
# for each manufactures, seleect rows with max price for that manufacturer.
df['max_price_manufacturer'] = df.groupby('name_manufacturer')['price'].transform(max)
df
code_product | name_product | price | manufacturer | code_manufacturer | name_manufacturer | max_price_manufacturer | |
---|---|---|---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 | 5 | Fujitsu | 240 |
1 | 2 | Memory | 120 | 6 | 6 | Winchester | 120 |
2 | 4 | Floppy disk | 5 | 6 | 6 | Winchester | 120 |
3 | 3 | ZIP drive | 150 | 4 | 4 | Iomega | 150 |
4 | 5 | Monitor | 240 | 1 | 1 | Sony | 240 |
5 | 6 | DVD drive | 180 | 2 | 2 | Creative Labs | 180 |
6 | 7 | CD drive | 90 | 2 | 2 | Creative Labs | 180 |
7 | 10 | DVD burner | 180 | 2 | 2 | Creative Labs | 180 |
8 | 8 | Printer | 270 | 3 | 3 | Hewlett-Packard | 270 |
9 | 9 | Toner cartridge | 66 | 3 | 3 | Hewlett-Packard | 270 |
df[df.price==df.max_price_manufacturer]
code_product | name_product | price | manufacturer | code_manufacturer | name_manufacturer | max_price_manufacturer | |
---|---|---|---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 | 5 | Fujitsu | 240 |
1 | 2 | Memory | 120 | 6 | 6 | Winchester | 120 |
3 | 3 | ZIP drive | 150 | 4 | 4 | Iomega | 150 |
4 | 5 | Monitor | 240 | 1 | 1 | Sony | 240 |
5 | 6 | DVD drive | 180 | 2 | 2 | Creative Labs | 180 |
7 | 10 | DVD burner | 180 | 2 | 2 | Creative Labs | 180 |
8 | 8 | Printer | 270 | 3 | 3 | Hewlett-Packard | 270 |
dfp.tail(2)
code | name | price | manufacturer | |
---|---|---|---|---|
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
dfp.loc[len(dfp)] = [dfp.code.iloc[-1]+1, 'Loudspeakers', 70, 2]
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
10 | 11 | Loudspeakers | 70 | 2 |
%%sql
INSERT INTO Products(code, name, price, manufacturer)
VALUES (11, 'Loudspeakers',70, 2);
%%sql
SELECT *
FROM Products;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
10 | 11 | Loudspeakers | 70 | 2 |
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
10 | 11 | Loudspeakers | 70 | 2 |
dfp.loc[dfp.code==8,'name'] = 'Laser Printer'
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Laser Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
10 | 11 | Loudspeakers | 70 | 2 |
%%sql
SELECT *
FROM Products;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 8 | Printer | 270 | 3 |
8 | 9 | Toner cartridge | 66 | 3 |
9 | 10 | DVD burner | 180 | 2 |
10 | 11 | Loudspeakers | 70 | 2 |
%%sql
UPDATE Products
SET name = 'Laser Printer'
WHERE code = 8;
%%sql
SELECT *
FROM Products;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 9 | Toner cartridge | 66 | 3 |
8 | 10 | DVD burner | 180 | 2 |
9 | 11 | Loudspeakers | 70 | 2 |
10 | 8 | Laser Printer | 270 | 3 |
dfp['price'] *= 0.9
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 216.0 | 5 |
1 | 2 | Memory | 108.0 | 6 |
2 | 3 | ZIP drive | 135.0 | 4 |
3 | 4 | Floppy disk | 4.5 | 6 |
4 | 5 | Monitor | 216.0 | 1 |
5 | 6 | DVD drive | 162.0 | 2 |
6 | 7 | CD drive | 81.0 | 2 |
7 | 8 | Laser Printer | 243.0 | 3 |
8 | 9 | Toner cartridge | 59.4 | 3 |
9 | 10 | DVD burner | 162.0 | 2 |
10 | 11 | Loudspeakers | 63.0 | 2 |
%%sql
SELECT *
FROM Products;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 240 | 5 |
1 | 2 | Memory | 120 | 6 |
2 | 3 | ZIP drive | 150 | 4 |
3 | 4 | Floppy disk | 5 | 6 |
4 | 5 | Monitor | 240 | 1 |
5 | 6 | DVD drive | 180 | 2 |
6 | 7 | CD drive | 90 | 2 |
7 | 9 | Toner cartridge | 66 | 3 |
8 | 10 | DVD burner | 180 | 2 |
9 | 11 | Loudspeakers | 70 | 2 |
10 | 8 | Laser Printer | 270 | 3 |
%%sql
UPDATE Products
SET price = price * 0.9;
%%sql
SELECT *
FROM Products;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 216.0 | 5 |
1 | 2 | Memory | 108.0 | 6 |
2 | 3 | ZIP drive | 135.0 | 4 |
3 | 4 | Floppy disk | 4.5 | 6 |
4 | 5 | Monitor | 216.0 | 1 |
5 | 6 | DVD drive | 162.0 | 2 |
6 | 7 | CD drive | 81.0 | 2 |
7 | 9 | Toner cartridge | 59.4 | 3 |
8 | 10 | DVD burner | 162.0 | 2 |
9 | 11 | Loudspeakers | 63.0 | 2 |
10 | 8 | Laser Printer | 243.0 | 3 |
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 216.0 | 5 |
1 | 2 | Memory | 108.0 | 6 |
2 | 3 | ZIP drive | 135.0 | 4 |
3 | 4 | Floppy disk | 4.5 | 6 |
4 | 5 | Monitor | 216.0 | 1 |
5 | 6 | DVD drive | 162.0 | 2 |
6 | 7 | CD drive | 81.0 | 2 |
7 | 8 | Laser Printer | 243.0 | 3 |
8 | 9 | Toner cartridge | 59.4 | 3 |
9 | 10 | DVD burner | 162.0 | 2 |
10 | 11 | Loudspeakers | 63.0 | 2 |
dfp.loc[dfp.price >= 120, 'price'] *= 0.9
dfp
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 194.4 | 5 |
1 | 2 | Memory | 108.0 | 6 |
2 | 3 | ZIP drive | 121.5 | 4 |
3 | 4 | Floppy disk | 4.5 | 6 |
4 | 5 | Monitor | 194.4 | 1 |
5 | 6 | DVD drive | 145.8 | 2 |
6 | 7 | CD drive | 81.0 | 2 |
7 | 8 | Laser Printer | 218.7 | 3 |
8 | 9 | Toner cartridge | 59.4 | 3 |
9 | 10 | DVD burner | 145.8 | 2 |
10 | 11 | Loudspeakers | 63.0 | 2 |
%%sql
SELECT *
FROM Products;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 1 | Hard drive | 216.0 | 5 |
1 | 2 | Memory | 108.0 | 6 |
2 | 3 | ZIP drive | 135.0 | 4 |
3 | 4 | Floppy disk | 4.5 | 6 |
4 | 5 | Monitor | 216.0 | 1 |
5 | 6 | DVD drive | 162.0 | 2 |
6 | 7 | CD drive | 81.0 | 2 |
7 | 9 | Toner cartridge | 59.4 | 3 |
8 | 10 | DVD burner | 162.0 | 2 |
9 | 11 | Loudspeakers | 63.0 | 2 |
10 | 8 | Laser Printer | 243.0 | 3 |
%%sql
UPDATE Products
SET price = price * 0.9
WHERE price >= 120;
%%sql
SELECT *
FROM Products;
code | name | price | manufacturer | |
---|---|---|---|---|
0 | 2 | Memory | 108.0 | 6 |
1 | 4 | Floppy disk | 4.5 | 6 |
2 | 7 | CD drive | 81.0 | 2 |
3 | 9 | Toner cartridge | 59.4 | 3 |
4 | 11 | Loudspeakers | 63.0 | 2 |
5 | 1 | Hard drive | 194.40 | 5 |
6 | 3 | ZIP drive | 121.50 | 4 |
7 | 5 | Monitor | 194.40 | 1 |
8 | 6 | DVD drive | 145.80 | 2 |
9 | 10 | DVD burner | 145.80 | 2 |
10 | 8 | Laser Printer | 218.70 | 3 |
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
Time taken to run whole notebook: 0 hr 0 min 2 secs