# setup pyspark
# note: there is no .precision in pyspark
# sdf.select(F.format_number('mycol',2,',').alias('formatted_number')).show()
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel('INFO')
# get table names in database
sqlContext.sql("show tables in datascience").filter("tableName like '%bp' ").toPandas()
Useful Python Code Snippets
import numpy as np
import pandas as pd
from datetime import date, datetime
# datetime addition
nextday = datetime.today() + pd.DateOffset(days=1)
# get multi-index column names
df.columns.get_level_values(1)
# get a single value by row position and column label
df.iloc[0,df.columns.get_loc(mycol)]
# style
df.style.highlight_min(axis=1)
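# pivot table (groupby/unstack is usually faster than pivot_table)
# the unique values of column 'col_pivot' become the new column headers
# note: pivot_table aggregates with mean by default; pass aggfunc='sum' to match the groupby version below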
df.pivot_table(index=cols_idx,values=col_val,columns=col_pivot)
df.groupby(['A','B',col_pivot])[col_val].sum().unstack(col_pivot).reset_index().rename_axis(None,axis=1)
Useful Notes and Caveats
Useful Notes
Pandas needs optional dependencies to read/write some file formats: openpyxl (Excel), pytables (HDF5), and pyarrow (Parquet/Feather).
# Pandas calendar year vs ISO calendar year
# ISO weeks always start on Monday, so the first days of January can belong to the last ISO week of the previous year.
pd.Timestamp('2022-01-01').year # 2022
pd.Timestamp('2022-01-01').isocalendar().year # 2021 (iso calendar starts from Monday)
Troubleshooting
pip SSL error
# If pip fails with an SSL certificate verification error, mark the package index hosts as trusted:
# (the same flags also work when pip is invoked as `/full/path/to/python -m pip install ...`)
pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org openpyxl
Blogs
Dec 2021
Date: Dec 15, 2021 Topic: pandas validation
Today, I came across this awesome python module called
pandera.
It is a data validation library.
Examples: we can check for null values, check for negative values, run hypothesis tests on columns, and so on.
Date: Dec 13, 2021 Topic: sort a list using two keys
Today, I learned how to sort a list using two keys.