import time
import os
time_start_notebook = time.time()


# my local library
import sys
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/")
sys.path.append("/Users/poudel/Dropbox/a00_Bhishan_Modules/bhishan")
from bhishan import bp


import numpy as np
import pandas as pd

import findspark
import pyspark
import psycopg2

%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Author: Bhishan Poudel

Python implementation: CPython
Python version       : 3.7.7
IPython version      : 7.16.1

Compiler    : Clang 4.0.1 (tags/RELEASE_401/final)
OS          : Darwin
Release     : 19.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit


pandas   : 1.0.5
findspark: 1.4.2
pyspark  : 3.1.2
sys      : 3.7.7 (default, May  6 2020, 04:59:01) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
psycopg2 : 2.8.5
numpy    : 1.17.5
bhishan  : 0.4


from datetime import datetime
from dateutil.relativedelta import relativedelta
# last_year = datetime.now() - relativedelta(years=1)


# pyspark settings
import os
import findspark

HOME = os.path.expanduser('~')
findspark.init(HOME + "/Softwares/Spark/spark-3.1.2-bin-hadoop3.2")

# Set spark environments
os.environ['PYSPARK_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = f'{HOME}/opt/miniconda3/envs/spk/bin/python'


import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col as _col
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.window import Window
from pyspark.sql.types import StructField, StringType, IntegerType,  FloatType, StructType
from pyspark.sql.functions import (mean as _mean, min as _min,
                                   max as _max, avg as _avg,
                                   when as _when
                                  )


spark = SparkSession\
        .builder\
        .appName("bhishan")\
        .getOrCreate()

sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")
print(pyspark.__version__)

3.1.2


from pyspark import SparkConf, SparkContext, SQLContext

spark = pyspark.sql.SparkSession.builder.appName('app').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")


# postgres configs
import os, json
data = json.load(open(os.path.expanduser('~/.config/postgres/credentials.json')))
pw,port = data['password'], data['port'] # port is 5432
dbname = 'wbmovie' # make sure pgadmin is running and given database is there.

%load_ext sql
%sql postgres://postgres:$pw@localhost:$port/$dbname

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


def f(q,n=None):
    return pd.read_sql(q,con).head(n)

def s(q,n=5):
    sdf = spark.sql(q)
    print(f'nrows: {sdf.count()}, ncols: {len(sdf.columns)}')
    return sdf.show(n)

def postgres_to_dataframe(query,con,n=None):
    """
    Get pandas dataframe from sql query using connection and cursor.
    """
    cur = con.cursor()
    try:
        cur.execute(query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cur.close()
        return 1

    fields = [i[0] for i in cur.description]
    rows = cur.fetchall()
    cur.close()

    # Create pandas dataframe
    df = pd.DataFrame(rows, columns=fields)
    return df.head(n)


%%sql

DROP TABLE IF EXISTS Movies CASCADE;


DROP TABLE IF EXISTS MovieTheaters CASCADE;


CREATE TABLE Movies (Code INTEGER PRIMARY KEY NOT NULL,
                                              Title TEXT NOT NULL,
                                                         Rating TEXT);


CREATE TABLE MovieTheaters (Code INTEGER PRIMARY KEY NOT NULL,
                                                     Name TEXT NOT NULL,
                                                               Movie INTEGER CONSTRAINT fk_Movies_Code REFERENCES Movies(Code));


INSERT INTO Movies(Code, Title, Rating)
VALUES(9,'Citizen King','G');


INSERT INTO Movies(Code, Title, Rating)
VALUES(1,'Citizen Kane','PG');


INSERT INTO Movies(Code, Title, Rating)
VALUES(2,'Singin'' in the Rain','G');


INSERT INTO Movies(Code, Title, Rating)
VALUES(3,'The Wizard of Oz','G');


INSERT INTO Movies(Code, Title, Rating)
VALUES(4,'The Quiet Man',NULL);


INSERT INTO Movies(Code, Title, Rating)
VALUES(5,'North by Northwest',NULL);


INSERT INTO Movies(Code, Title, Rating)
VALUES(6,'The Last Tango in Paris','NC-17');


INSERT INTO Movies(Code, Title, Rating)
VALUES(7,'Some Like it Hot','PG-13');


INSERT INTO Movies(Code, Title, Rating)
VALUES(8,'A Night at the Opera',NULL);


INSERT INTO MovieTheaters(Code, Name, Movie)
VALUES(1,'Odeon',5);


INSERT INTO MovieTheaters(Code, Name, Movie)
VALUES(2,'Imperial',1);


INSERT INTO MovieTheaters(Code, Name, Movie)
VALUES(3,'Majestic',NULL);


INSERT INTO MovieTheaters(Code, Name, Movie)
VALUES(4,'Royale',6);


INSERT INTO MovieTheaters(Code, Name, Movie)
VALUES(5,'Paraiso',3);


INSERT INTO MovieTheaters(Code, Name, Movie)
VALUES(6,'Nickelodeon',NULL);


SELECT *
FROM movies;

 * postgres://postgres:***@localhost:5432/wbmovie
Done.
Done.
Done.
Done.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
9 rows affected.


import psycopg2

con = psycopg2.connect(host = "localhost",database=dbname,
                       user = "postgres",password = pw)


cur = con.cursor()
cur.execute("select * from Movies limit 2;")

fields = [i[0] for i in cur.description]
rows = cur.fetchall()

for row in rows:
    print({f:r for f,r in zip(fields,row)})
    print()

cur.close()

{'code': 9, 'title': 'Citizen King', 'rating': 'G'}

{'code': 1, 'title': 'Citizen Kane', 'rating': 'PG'}


query = "select * from Movies limit 2;"
postgres_to_dataframe(query,con)


con.close()


x = %sql select * from Movies;

dfm = x.DataFrame()
dfm

 * postgres://postgres:***@localhost:5432/wbmovie
9 rows affected.


dfm.dtypes

code       int64
title     object
rating    object
dtype: object


schema = StructType([
    StructField('code',IntegerType(),True),
    StructField('title',StringType(),True),
    StructField('rating',StringType(),True),
    ])

sdfm = sqlContext.createDataFrame(dfm, schema)
sdfm.show()

+----+--------------------+------+
|code|               title|rating|
+----+--------------------+------+
|   9|        Citizen King|     G|
|   1|        Citizen Kane|    PG|
|   2| Singin' in the Rain|     G|
|   3|    The Wizard of Oz|     G|
|   4|       The Quiet Man|  null|
|   5|  North by Northwest|  null|
|   6|The Last Tango in...| NC-17|
|   7|    Some Like it Hot| PG-13|
|   8|A Night at the Opera|  null|
+----+--------------------+------+


x = %sql select * from MovieTheaters;

dft = x.DataFrame()
dft

 * postgres://postgres:***@localhost:5432/wbmovie
6 rows affected.


dft.dtypes

code       int64
name      object
movie    float64
dtype: object


schema = StructType([
    StructField('code',IntegerType(),True),
    StructField('name',StringType(),True),
    StructField('movie',FloatType(),True),
    ])

sdft = sqlContext.createDataFrame(dft, schema)
sdft.show()

+----+-----------+-----+
|code|       name|movie|
+----+-----------+-----+
|   1|      Odeon|  5.0|
|   2|   Imperial|  1.0|
|   3|   Majestic|  NaN|
|   4|     Royale|  6.0|
|   5|    Paraiso|  3.0|
|   6|Nickelodeon|  NaN|
+----+-----------+-----+


# sdft.printSchema()


dfm.head(1)


dft.head(1)


df = dfm.merge(dft,left_on='code', right_on='movie',suffixes=['_m','_t'])
df


cols1 = sdft.columns
cols2  = sdfm.columns
cols_common = set(cols1) & set(cols2)
cols1, cols2, cols_common

(['code', 'name', 'movie'], ['code', 'title', 'rating'], {'code'})


# rename common columns of first spark dataframe.


sdfm2 = sdfm.withColumnRenamed('code','code_m')
sdfm2.show(2)

+------+------------+------+
|code_m|       title|rating|
+------+------------+------+
|     9|Citizen King|     G|
|     1|Citizen Kane|    PG|
+------+------------+------+
only showing top 2 rows


# rename common columns of second spark dataframe.


sdft2 = sdft.withColumnRenamed('code','code_t')
sdft2.show(2)

+------+--------+-----+
|code_t|    name|movie|
+------+--------+-----+
|     1|   Odeon|  5.0|
|     2|Imperial|  1.0|
+------+--------+-----+
only showing top 2 rows


# now join two spark dataframes which have different column names.


sdf = sdfm2.join(sdft2,sdfm2.code_m==sdft2.movie)
sdf.show()

+------+--------------------+------+------+--------+-----+
|code_m|               title|rating|code_t|    name|movie|
+------+--------------------+------+------+--------+-----+
|     5|  North by Northwest|  null|     1|   Odeon|  5.0|
|     3|    The Wizard of Oz|     G|     5| Paraiso|  3.0|
|     1|        Citizen Kane|    PG|     2|Imperial|  1.0|
|     6|The Last Tango in...| NC-17|     4|  Royale|  6.0|
+------+--------------------+------+------+--------+-----+


bp.show_methods(sdf,4)
# count, fillna, filter, where, groubpy, first, corr,
# missing than pandas: max, min, sum, etc.


# Register the DataFrame as a SQL temporary view
sdfm.createOrReplaceTempView("Movies")
sdft.createOrReplaceTempView("MovieTheaters")
sdf.createOrReplaceTempView("sdf")

spark.sql('select * from Movies limit 2').show()

+----+------------+------+
|code|       title|rating|
+----+------------+------+
|   9|Citizen King|     G|
|   1|Citizen Kane|    PG|
+----+------------+------+


# dfm['Title']


# sdfm.select('Title').show()


%%sql

SELECT title
FROM Movies;

 * postgres://postgres:***@localhost:5432/wbmovie
9 rows affected.


q = "select title from Movies;"
s(q)

nrows: 9, ncols: 1
+-------------------+
|              title|
+-------------------+
|       Citizen King|
|       Citizen Kane|
|Singin' in the Rain|
|   The Wizard of Oz|
|      The Quiet Man|
+-------------------+
only showing top 5 rows


dfm['rating'].unique()

array(['G', 'PG', None, 'NC-17', 'PG-13'], dtype=object)


sdfm.select('rating').distinct().show()

+------+
|rating|
+------+
|  null|
|    PG|
| NC-17|
|     G|
| PG-13|
+------+


%%sql

SELECT DISTINCT rating
FROM Movies;

 * postgres://postgres:***@localhost:5432/wbmovie
5 rows affected.


s(q)

nrows: 9, ncols: 1
+-------------------+
|              title|
+-------------------+
|       Citizen King|
|       Citizen Kane|
|Singin' in the Rain|
|   The Wizard of Oz|
|      The Quiet Man|
+-------------------+
only showing top 5 rows


dfm[dfm.rating.isnull()]


sdfm.filter(sdfm.rating.isNull()).show()

+----+--------------------+------+
|code|               title|rating|
+----+--------------------+------+
|   4|       The Quiet Man|  null|
|   5|  North by Northwest|  null|
|   8|A Night at the Opera|  null|
+----+--------------------+------+


%%sql

SELECT *
FROM Movies
WHERE Rating IS NULL;

 * postgres://postgres:***@localhost:5432/wbmovie
3 rows affected.


q = "select * from Movies where Rating is null"
s(q)

nrows: 3, ncols: 3
+----+--------------------+------+
|code|               title|rating|
+----+--------------------+------+
|   4|       The Quiet Man|  null|
|   5|  North by Northwest|  null|
|   8|A Night at the Opera|  null|
+----+--------------------+------+


dft[dft.movie.isnull()]


sdft.filter(sdft.movie.isNull()).show() # fails, gives empty result
sdft.where(F.isnull('movie')).show() # gives empty result

+----+----+-----+
|code|name|movie|
+----+----+-----+
+----+----+-----+

+----+----+-----+
|code|name|movie|
+----+----+-----+
+----+----+-----+


spark.sql("""select * from MovieTheaters where movie = 'NaN' """).show()

+----+-----------+-----+
|code|       name|movie|
+----+-----------+-----+
|   3|   Majestic|  NaN|
|   6|Nickelodeon|  NaN|
+----+-----------+-----+


spark.sql("select * from MovieTheaters where isnan(movie)").show()

+----+-----------+-----+
|code|       name|movie|
+----+-----------+-----+
|   3|   Majestic|  NaN|
|   6|Nickelodeon|  NaN|
+----+-----------+-----+


%%sql

SELECT *
FROM MovieTheaters
WHERE movie IS NULL;

 * postgres://postgres:***@localhost:5432/wbmovie
2 rows affected.


q = 'select * from MovieTheaters where movie is null'
s(q)

nrows: 0, ncols: 3
+----+----+-----+
|code|name|movie|
+----+----+-----+
+----+----+-----+


dft.head(1)


dfm.head(1)


dft.merge(dfm,left_on='movie',right_on='code',how='left',suffixes=['_t','_m'])
# left join also include nulls.


(sdft2
 .join(sdfm2,
       sdfm2.code_m==sdft2.movie,
       how='left'
      )

 .show()
)

+------+-----------+-----+------+--------------------+------+
|code_t|       name|movie|code_m|               title|rating|
+------+-----------+-----+------+--------------------+------+
|     1|      Odeon|  5.0|     5|  North by Northwest|  null|
|     5|    Paraiso|  3.0|     3|    The Wizard of Oz|     G|
|     2|   Imperial|  1.0|     1|        Citizen Kane|    PG|
|     4|     Royale|  6.0|     6|The Last Tango in...| NC-17|
|     3|   Majestic|  NaN|  null|                null|  null|
|     6|Nickelodeon|  NaN|  null|                null|  null|
+------+-----------+-----+------+--------------------+------+


%%sql

SELECT *
FROM MovieTheaters t
LEFT JOIN Movies m ON m.code = t.movie;

 * postgres://postgres:***@localhost:5432/wbmovie
6 rows affected.


q = """
select *
from MovieTheaters t
left join Movies m
on m.code = t.movie
"""

s(q,6)

nrows: 6, ncols: 6
+----+-----------+-----+----+--------------------+------+
|code|       name|movie|code|               title|rating|
+----+-----------+-----+----+--------------------+------+
|   1|      Odeon|  5.0|   5|  North by Northwest|  null|
|   5|    Paraiso|  3.0|   3|    The Wizard of Oz|     G|
|   2|   Imperial|  1.0|   1|        Citizen Kane|    PG|
|   4|     Royale|  6.0|   6|The Last Tango in...| NC-17|
|   3|   Majestic|  NaN|null|                null|  null|
|   6|Nickelodeon|  NaN|null|                null|  null|
+----+-----------+-----+----+--------------------+------+


%%sql

SELECT *
FROM MovieTheaters t
RIGHT JOIN Movies m ON m.code=t.movie;

 * postgres://postgres:***@localhost:5432/wbmovie
9 rows affected.


q = """
select *
from MovieTheaters t
right join Movies m
on m.code=t.movie
"""

s(q,9)

nrows: 9, ncols: 6
+----+--------+-----+----+--------------------+------+
|code|    name|movie|code|               title|rating|
+----+--------+-----+----+--------------------+------+
|null|    null| null|   9|        Citizen King|     G|
|   1|   Odeon|  5.0|   5|  North by Northwest|  null|
|null|    null| null|   7|    Some Like it Hot| PG-13|
|null|    null| null|   2| Singin' in the Rain|     G|
|   5| Paraiso|  3.0|   3|    The Wizard of Oz|     G|
|   2|Imperial|  1.0|   1|        Citizen Kane|    PG|
|   4|  Royale|  6.0|   6|The Last Tango in...| NC-17|
|null|    null| null|   8|A Night at the Opera|  null|
|null|    null| null|   4|       The Quiet Man|  null|
+----+--------+-----+----+--------------------+------+

dfm

dft


# I see only moives 1,3,5,6 are showing in theaters.
# movies 2,4,7-9 are not being shown in any theater.

# 2	Singin' in the Rain	G
# 4	The Quiet Man	None


%%sql

SELECT title
FROM Movies m
WHERE code NOT IN
    (SELECT movie
     FROM MovieTheaters
     WHERE movie IS NOT NULL );

 * postgres://postgres:***@localhost:5432/wbmovie
5 rows affected.


q = """
select *
from Movies m
left join MovieTheaters t
on t.movie = m.code
where t.movie is null
"""

s(q,6)

nrows: 5, ncols: 6
+----+--------------------+------+----+----+-----+
|code|               title|rating|code|name|movie|
+----+--------------------+------+----+----+-----+
|   9|        Citizen King|     G|null|null| null|
|   7|    Some Like it Hot| PG-13|null|null| null|
|   2| Singin' in the Rain|     G|null|null| null|
|   8|A Night at the Opera|  null|null|null| null|
|   4|       The Quiet Man|  null|null|null| null|
+----+--------------------+------+----+----+-----+


%%sql

SELECT *
FROM Movies;

 * postgres://postgres:***@localhost:5432/wbmovie
9 rows affected.


%%sql

INSERT INTO Movies(code, Title, Rating)
VALUES(10, 'One, Two, Three',NULL);

 * postgres://postgres:***@localhost:5432/wbmovie
1 rows affected.

[]


%%sql

SELECT *
FROM Movies;

 * postgres://postgres:***@localhost:5432/wbmovie
10 rows affected.


%%sql

SELECT *
FROM Movies;

 * postgres://postgres:***@localhost:5432/wbmovie
10 rows affected.


%%sql

UPDATE Movies
SET Rating='G'
WHERE Rating IS NULL;


SELECT *
FROM Movies;

 * postgres://postgres:***@localhost:5432/wbmovie
4 rows affected.
10 rows affected.


%%sql

SELECT *
FROM Movies;

 * postgres://postgres:***@localhost:5432/wbmovie
10 rows affected.


%%sql

DELETE
FROM MovieTheaters
WHERE Movie IN
    (SELECT Code
     FROM Movies
     WHERE Rating = 'NC-17');

 * postgres://postgres:***@localhost:5432/wbmovie
1 rows affected.

[]


%%sql

SELECT *
FROM Movies;

 * postgres://postgres:***@localhost:5432/wbmovie
10 rows affected.


time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken: {:.0f} hr {:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken: 0 hr 0 min 26 secs

	0	1	2	3
0	agg	drop_duplicates	localCheckpoint	sql_ctx
1	alias	dropna	mapInPandas	stat
2	approxQuantile	dtypes	na	storageLevel
3	cache	exceptAll	orderBy	subtract
4	checkpoint	explain	persist	summary
5	coalesce	fillna	printSchema	tail
6	colRegex	filter	randomSplit	take
7	collect	first	rdd	toDF
8	columns	foreach	registerTempTable	toJSON
9	corr	foreachPartition	repartition	toLocalIterator
10	count	freqItems	repartitionByRange	toPandas
11	cov	groupBy	replace	transform
12	createGlobalTempView	groupby	rollup	union
13	createOrReplaceGlobalTempView	head	sameSemantics	unionAll
14	createOrReplaceTempView	hint	sample	unionByName
15	createTempView	inputFiles	sampleBy	unpersist
16	crossJoin	intersect	schema	where
17	crosstab	intersectAll	select	withColumn
18	cube	isLocal	selectExpr	withColumnRenamed
19	describe	isStreaming	semanticHash	withWatermark
20	distinct	is_cached	show	write
21	drop	join	sort	writeStream
22	dropDuplicates	limit	sortWithinPartitions	writeTo

Table of Contents

Wikibooks SQL Exercise: Movie Theaters¶

Load the libraries¶

Connect to database using postgres¶

Utility Functions¶

Create Tables¶

Connect to postgres using psycopg2¶

Pandas and Spark dataframes¶

data: Movies¶

data: MovieTheaters¶

data: combined data¶

merge two dataframes using pyspark¶

spark dataframe attributes¶

Create SQL table from spark dataframes¶

Exercise Questions¶

1. Select the title of all movies.¶

2. Show all the distinct ratings in the database.¶

3. Show all unrated movies.¶

4. Select all movie theaters that are not currently showing a movie.¶

5. Select all data from all movie theaters and, additionally, the data from the movie that is being shown in the theater (if one is being shown).¶

6. Select all data from all movies and, if that movie is being shown in a theater, show the data from the theater.¶

7. Show the titles of movies not currently being shown in any theaters.¶

8. Add the unrated movie "One, Two, Three".¶

9. Set the rating of all unrated movies to "G".¶

10. Remove movie theaters projecting movies rated "NC-17".¶

Time Taken¶

code	title	rating
9	Citizen King	G
1	Citizen Kane	PG
2	Singin' in the Rain	G
3	The Wizard of Oz	G
4	The Quiet Man	None
5	North by Northwest	None
6	The Last Tango in Paris	NC-17
7	Some Like it Hot	PG-13
8	A Night at the Opera	None

	code	name	movie
0	1	Odeon	5.0
1	2	Imperial	1.0
2	3	Majestic	NaN
3	4	Royale	6.0
4	5	Paraiso	3.0
5	6	Nickelodeon	NaN