Adopted from https://www.youtube.com/playlist?list=PLINDUevGdb7U_ZRLCpqKWutmcY0vCUFOI
See Polars Tutorial2 for more.
Reading CSV Files
import polars as pl
# df = pl.read_csv('pl_data.csv')
Creating New Columns
df = pl.DataFrame({'Name': ['Mario', 'Luigi', 'Wario', 'Mario', 'Mario'], 'Age': [30,28,26,30,30]})
print(df)
shape: (5, 2)
┌───────┬─────┐
│ Name ┆ Age │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪═════╡
│ Mario ┆ 30 │
│ Luigi ┆ 28 │
│ Wario ┆ 26 │
│ Mario ┆ 30 │
│ Mario ┆ 30 │
└───────┴─────┘
score_values = pl.Series([95,99,94,90,96])
df = df.with_columns(Score = pl.lit(score_values),
Score_x2 = pl.lit(score_values*2))
print(df)
shape: (5, 4)
┌───────┬─────┬───────┬──────────┐
│ Name ┆ Age ┆ Score ┆ Score_x2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═══════╪═════╪═══════╪══════════╡
│ Mario ┆ 30 ┆ 95 ┆ 190 │
│ Luigi ┆ 28 ┆ 99 ┆ 198 │
│ Wario ┆ 26 ┆ 94 ┆ 188 │
│ Mario ┆ 30 ┆ 90 ┆ 180 │
│ Mario ┆ 30 ┆ 96 ┆ 192 │
└───────┴─────┴───────┴──────────┘
Dropping Columns
print(df.drop('Score_x2'))
print(df.drop('Score_x2', 'Score'))
print(df.drop(['Score_x2', 'Score']))
df.drop_in_place('Score_x2')
print(df)
shape: (5, 3)
┌───────┬─────┬───────┐
│ Name ┆ Age ┆ Score │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═══════╪═════╪═══════╡
│ Mario ┆ 30 ┆ 95 │
│ Luigi ┆ 28 ┆ 99 │
│ Wario ┆ 26 ┆ 94 │
│ Mario ┆ 30 ┆ 90 │
│ Mario ┆ 30 ┆ 96 │
└───────┴─────┴───────┘
shape: (5, 2)
┌───────┬─────┐
│ Name ┆ Age │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪═════╡
│ Mario ┆ 30 │
│ Luigi ┆ 28 │
│ Wario ┆ 26 │
│ Mario ┆ 30 │
│ Mario ┆ 30 │
└───────┴─────┘
shape: (5, 2)
┌───────┬─────┐
│ Name ┆ Age │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪═════╡
│ Mario ┆ 30 │
│ Luigi ┆ 28 │
│ Wario ┆ 26 │
│ Mario ┆ 30 │
│ Mario ┆ 30 │
└───────┴─────┘
shape: (5, 3)
┌───────┬─────┬───────┐
│ Name ┆ Age ┆ Score │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═══════╪═════╪═══════╡
│ Mario ┆ 30 ┆ 95 │
│ Luigi ┆ 28 ┆ 99 │
│ Wario ┆ 26 ┆ 94 │
│ Mario ┆ 30 ┆ 90 │
│ Mario ┆ 30 ┆ 96 │
└───────┴─────┴───────┘
Drop Duplicate Rows
df = pl.DataFrame({'Name': ['Mario', 'Luigi', 'Wario', 'Mario', 'Mario'], 'Age': [30,28,26,30,30]})
print(df.unique())
shape: (3, 2)
┌───────┬─────┐
│ Name ┆ Age │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪═════╡
│ Mario ┆ 30 │
│ Luigi ┆ 28 │
│ Wario ┆ 26 │
└───────┴─────┘
Iterating Polars Dataframes
for row in df.rows():
print(row)
print(row[0])
('Mario', 30)
Mario
('Luigi', 28)
Luigi
('Wario', 26)
Wario
('Mario', 30)
Mario
('Mario', 30)
Mario
Plot Polars Dataframe
import matplotlib.pyplot as plt
df = pl.DataFrame({'Date': ['1/1/2023','1/2/2023','1/3/2023','1/4/2023','1/5/2023','1/6/2023','1/7/2023','1/8/2023','1/9/2023','1/10/2023'],
'Price': [15,16,16,15,14,13,14,17,16,18]})
dates = list(df['Date'])
prices = list(df['Price'])
plt.plot(dates, prices)
plt.show()
Converting Polars Dataframe To Pandas
This operation requires that both pandas
and pyarrow
are installed.
x = df.to_pandas()
print(type(df))
print(type(x))
print(df)
print(x)
<class 'polars.dataframe.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
shape: (10, 2)
┌───────────┬───────┐
│ Date ┆ Price │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════════╪═══════╡
│ 1/1/2023 ┆ 15 │
│ 1/2/2023 ┆ 16 │
│ 1/3/2023 ┆ 16 │
│ 1/4/2023 ┆ 15 │
│ 1/5/2023 ┆ 14 │
│ 1/6/2023 ┆ 13 │
│ 1/7/2023 ┆ 14 │
│ 1/8/2023 ┆ 17 │
│ 1/9/2023 ┆ 16 │
│ 1/10/2023 ┆ 18 │
└───────────┴───────┘
Date Price
0 1/1/2023 15
1 1/2/2023 16
2 1/3/2023 16
3 1/4/2023 15
4 1/5/2023 14
5 1/6/2023 13
6 1/7/2023 14
7 1/8/2023 17
8 1/9/2023 16
9 1/10/2023 18
Filtering Polars Dataframes
print(df.filter(df['Price'] > 15))
print(df.filter((df['Price'] > 15) & (df['Price'] < 17)))
shape: (5, 2)
┌───────────┬───────┐
│ Date ┆ Price │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════════╪═══════╡
│ 1/2/2023 ┆ 16 │
│ 1/3/2023 ┆ 16 │
│ 1/8/2023 ┆ 17 │
│ 1/9/2023 ┆ 16 │
│ 1/10/2023 ┆ 18 │
└───────────┴───────┘
shape: (3, 2)
┌──────────┬───────┐
│ Date ┆ Price │
│ --- ┆ --- │
│ str ┆ i64 │
╞══════════╪═══════╡
│ 1/2/2023 ┆ 16 │
│ 1/3/2023 ┆ 16 │
│ 1/9/2023 ┆ 16 │
└──────────┴───────┘
Joining/merging Polars Dataframes
df1 = pl.DataFrame({
'key': ['A', 'B', 'C', 'D'],
'value_left': [1, 2, 3, 4]
})
df2 = pl.DataFrame({
'key': ['B', 'C', 'D', 'E'],
'value_left': [5, 6, 7, 8]
})
joined_df = df1.join(df2, on = 'key')
print(joined_df)
joined_df = df1.join(df2, on = 'key', how = 'left')
print(joined_df)
shape: (3, 3)
┌─────┬────────────┬──────────────────┐
│ key ┆ value_left ┆ value_left_right │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪════════════╪══════════════════╡
│ B ┆ 2 ┆ 5 │
│ C ┆ 3 ┆ 6 │
│ D ┆ 4 ┆ 7 │
└─────┴────────────┴──────────────────┘
shape: (4, 3)
┌─────┬────────────┬──────────────────┐
│ key ┆ value_left ┆ value_left_right │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪════════════╪══════════════════╡
│ A ┆ 1 ┆ null │
│ B ┆ 2 ┆ 5 │
│ C ┆ 3 ┆ 6 │
│ D ┆ 4 ┆ 7 │
└─────┴────────────┴──────────────────┘
Aggregate Data
df = pl.DataFrame({
'Name': ['Chris', 'Chris', 'Kyle', 'John', 'John', 'Chris', 'Kyle'],
'Calories': [100,200,50,100,350,150,450]
})
df2 = df.group_by(['Name']).agg(
sum_ = pl.col('Calories').sum(),
count_ = pl.col('Calories').count()
)
print(df2)
shape: (3, 3)
┌───────┬──────┬────────┐
│ Name ┆ sum_ ┆ count_ │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ u32 │
╞═══════╪══════╪════════╡
│ Kyle ┆ 500 ┆ 2 │
│ Chris ┆ 450 ┆ 3 │
│ John ┆ 450 ┆ 2 │
└───────┴──────┴────────┘
Sorting
print(df.sort(['Name', 'Calories']))
shape: (7, 2)
┌───────┬──────────┐
│ Name ┆ Calories │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════════╡
│ Chris ┆ 100 │
│ Chris ┆ 150 │
│ Chris ┆ 200 │
│ John ┆ 100 │
│ John ┆ 350 │
│ Kyle ┆ 50 │
│ Kyle ┆ 450 │
└───────┴──────────┘
Renaming Columns
renamed_col = df.rename({'Name': 'ID', 'Calories': 'Consumption'})
print(renamed_col)
shape: (7, 2)
┌───────┬─────────────┐
│ ID ┆ Consumption │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪═════════════╡
│ Chris ┆ 100 │
│ Chris ┆ 200 │
│ Kyle ┆ 50 │
│ John ┆ 100 │
│ John ┆ 350 │
│ Chris ┆ 150 │
│ Kyle ┆ 450 │
└───────┴─────────────┘
Select Columns
print(df2.select(['Name', 'count_']))
shape: (3, 2)
┌───────┬────────┐
│ Name ┆ count_ │
│ --- ┆ --- │
│ str ┆ u32 │
╞═══════╪════════╡
│ Kyle ┆ 2 │
│ Chris ┆ 3 │
│ John ┆ 2 │
└───────┴────────┘
Fill/Drop Null Values
df = pl.DataFrame({
'Name': ['John', 'Steven', 'Chris', None],
'Age': [20, 25, None, 32],
'Score': [99, None, 100, None]
})
print(df)
fill_df = df.fill_null('NULL_STRING')
print(fill_df)
fill_df = df.fill_null(0)
print(fill_df)
print(df.drop_nulls())
print(fill_df.drop_nulls())
shape: (4, 3)
┌────────┬──────┬───────┐
│ Name ┆ Age ┆ Score │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞════════╪══════╪═══════╡
│ John ┆ 20 ┆ 99 │
│ Steven ┆ 25 ┆ null │
│ Chris ┆ null ┆ 100 │
│ null ┆ 32 ┆ null │
└────────┴──────┴───────┘
shape: (4, 3)
┌─────────────┬──────┬───────┐
│ Name ┆ Age ┆ Score │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════════════╪══════╪═══════╡
│ John ┆ 20 ┆ 99 │
│ Steven ┆ 25 ┆ null │
│ Chris ┆ null ┆ 100 │
│ NULL_STRING ┆ 32 ┆ null │
└─────────────┴──────┴───────┘
shape: (4, 3)
┌────────┬─────┬───────┐
│ Name ┆ Age ┆ Score │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞════════╪═════╪═══════╡
│ John ┆ 20 ┆ 99 │
│ Steven ┆ 25 ┆ 0 │
│ Chris ┆ 0 ┆ 100 │
│ null ┆ 32 ┆ 0 │
└────────┴─────┴───────┘
shape: (1, 3)
┌──────┬─────┬───────┐
│ Name ┆ Age ┆ Score │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞══════╪═════╪═══════╡
│ John ┆ 20 ┆ 99 │
└──────┴─────┴───────┘
shape: (3, 3)
┌────────┬─────┬───────┐
│ Name ┆ Age ┆ Score │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞════════╪═════╪═══════╡
│ John ┆ 20 ┆ 99 │
│ Steven ┆ 25 ┆ 0 │
│ Chris ┆ 0 ┆ 100 │
└────────┴─────┴───────┘
Convert String To Date
df = pl.DataFrame({'Date': ['1/1/2023','1/2/2023','1/3/2023','1/4/2023','1/5/2023','1/6/2023','1/7/2023','1/8/2023','1/9/2023','1/10/2023'],
'Price': [15,16,16,15,14,13,14,17,16,18]})
df = df.with_columns(Date2 = df['Date'].str.to_date())
print(df)
df = df.with_columns(Date3 = df['Date2'].dt.strftime('%m%Y'))
print(df)
shape: (10, 3)
┌───────────┬───────┬────────────┐
│ Date ┆ Price ┆ Date2 │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ date │
╞═══════════╪═══════╪════════════╡
│ 1/1/2023 ┆ 15 ┆ 2023-01-01 │
│ 1/2/2023 ┆ 16 ┆ 2023-02-01 │
│ 1/3/2023 ┆ 16 ┆ 2023-03-01 │
│ 1/4/2023 ┆ 15 ┆ 2023-04-01 │
│ 1/5/2023 ┆ 14 ┆ 2023-05-01 │
│ 1/6/2023 ┆ 13 ┆ 2023-06-01 │
│ 1/7/2023 ┆ 14 ┆ 2023-07-01 │
│ 1/8/2023 ┆ 17 ┆ 2023-08-01 │
│ 1/9/2023 ┆ 16 ┆ 2023-09-01 │
│ 1/10/2023 ┆ 18 ┆ 2023-10-01 │
└───────────┴───────┴────────────┘
shape: (10, 4)
┌───────────┬───────┬────────────┬────────┐
│ Date ┆ Price ┆ Date2 ┆ Date3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ date ┆ str │
╞═══════════╪═══════╪════════════╪════════╡
│ 1/1/2023 ┆ 15 ┆ 2023-01-01 ┆ 012023 │
│ 1/2/2023 ┆ 16 ┆ 2023-02-01 ┆ 022023 │
│ 1/3/2023 ┆ 16 ┆ 2023-03-01 ┆ 032023 │
│ 1/4/2023 ┆ 15 ┆ 2023-04-01 ┆ 042023 │
│ 1/5/2023 ┆ 14 ┆ 2023-05-01 ┆ 052023 │
│ 1/6/2023 ┆ 13 ┆ 2023-06-01 ┆ 062023 │
│ 1/7/2023 ┆ 14 ┆ 2023-07-01 ┆ 072023 │
│ 1/8/2023 ┆ 17 ┆ 2023-08-01 ┆ 082023 │
│ 1/9/2023 ┆ 16 ┆ 2023-09-01 ┆ 092023 │
│ 1/10/2023 ┆ 18 ┆ 2023-10-01 ┆ 102023 │
└───────────┴───────┴────────────┴────────┘
Change Column Data Type
df = df.with_columns(Price = df['Price'].cast(pl.Utf8))
print(df)
df = df.with_columns(Price = df['Price'].cast(str))
print(df)
df = df.with_columns(Price = df['Price'].cast(pl.Float64))
print(df)
shape: (10, 4)
┌───────────┬───────┬────────────┬────────┐
│ Date ┆ Price ┆ Date2 ┆ Date3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ date ┆ str │
╞═══════════╪═══════╪════════════╪════════╡
│ 1/1/2023 ┆ 15 ┆ 2023-01-01 ┆ 012023 │
│ 1/2/2023 ┆ 16 ┆ 2023-02-01 ┆ 022023 │
│ 1/3/2023 ┆ 16 ┆ 2023-03-01 ┆ 032023 │
│ 1/4/2023 ┆ 15 ┆ 2023-04-01 ┆ 042023 │
│ 1/5/2023 ┆ 14 ┆ 2023-05-01 ┆ 052023 │
│ 1/6/2023 ┆ 13 ┆ 2023-06-01 ┆ 062023 │
│ 1/7/2023 ┆ 14 ┆ 2023-07-01 ┆ 072023 │
│ 1/8/2023 ┆ 17 ┆ 2023-08-01 ┆ 082023 │
│ 1/9/2023 ┆ 16 ┆ 2023-09-01 ┆ 092023 │
│ 1/10/2023 ┆ 18 ┆ 2023-10-01 ┆ 102023 │
└───────────┴───────┴────────────┴────────┘
shape: (10, 4)
┌───────────┬───────┬────────────┬────────┐
│ Date ┆ Price ┆ Date2 ┆ Date3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ date ┆ str │
╞═══════════╪═══════╪════════════╪════════╡
│ 1/1/2023 ┆ 15 ┆ 2023-01-01 ┆ 012023 │
│ 1/2/2023 ┆ 16 ┆ 2023-02-01 ┆ 022023 │
│ 1/3/2023 ┆ 16 ┆ 2023-03-01 ┆ 032023 │
│ 1/4/2023 ┆ 15 ┆ 2023-04-01 ┆ 042023 │
│ 1/5/2023 ┆ 14 ┆ 2023-05-01 ┆ 052023 │
│ 1/6/2023 ┆ 13 ┆ 2023-06-01 ┆ 062023 │
│ 1/7/2023 ┆ 14 ┆ 2023-07-01 ┆ 072023 │
│ 1/8/2023 ┆ 17 ┆ 2023-08-01 ┆ 082023 │
│ 1/9/2023 ┆ 16 ┆ 2023-09-01 ┆ 092023 │
│ 1/10/2023 ┆ 18 ┆ 2023-10-01 ┆ 102023 │
└───────────┴───────┴────────────┴────────┘
shape: (10, 4)
┌───────────┬───────┬────────────┬────────┐
│ Date ┆ Price ┆ Date2 ┆ Date3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ date ┆ str │
╞═══════════╪═══════╪════════════╪════════╡
│ 1/1/2023 ┆ 15.0 ┆ 2023-01-01 ┆ 012023 │
│ 1/2/2023 ┆ 16.0 ┆ 2023-02-01 ┆ 022023 │
│ 1/3/2023 ┆ 16.0 ┆ 2023-03-01 ┆ 032023 │
│ 1/4/2023 ┆ 15.0 ┆ 2023-04-01 ┆ 042023 │
│ 1/5/2023 ┆ 14.0 ┆ 2023-05-01 ┆ 052023 │
│ 1/6/2023 ┆ 13.0 ┆ 2023-06-01 ┆ 062023 │
│ 1/7/2023 ┆ 14.0 ┆ 2023-07-01 ┆ 072023 │
│ 1/8/2023 ┆ 17.0 ┆ 2023-08-01 ┆ 082023 │
│ 1/9/2023 ┆ 16.0 ┆ 2023-09-01 ┆ 092023 │
│ 1/10/2023 ┆ 18.0 ┆ 2023-10-01 ┆ 102023 │
└───────────┴───────┴────────────┴────────┘
Shift Values
df = df.with_columns(Shifted = df['Date3'].shift(1)
.fill_null('SHIFTED'))
print(df)
shape: (10, 5)
┌───────────┬───────┬────────────┬────────┬─────────┐
│ Date ┆ Price ┆ Date2 ┆ Date3 ┆ Shifted │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ date ┆ str ┆ str │
╞═══════════╪═══════╪════════════╪════════╪═════════╡
│ 1/1/2023 ┆ 15.0 ┆ 2023-01-01 ┆ 012023 ┆ SHIFTED │
│ 1/2/2023 ┆ 16.0 ┆ 2023-02-01 ┆ 022023 ┆ 012023 │
│ 1/3/2023 ┆ 16.0 ┆ 2023-03-01 ┆ 032023 ┆ 022023 │
│ 1/4/2023 ┆ 15.0 ┆ 2023-04-01 ┆ 042023 ┆ 032023 │
│ 1/5/2023 ┆ 14.0 ┆ 2023-05-01 ┆ 052023 ┆ 042023 │
│ 1/6/2023 ┆ 13.0 ┆ 2023-06-01 ┆ 062023 ┆ 052023 │
│ 1/7/2023 ┆ 14.0 ┆ 2023-07-01 ┆ 072023 ┆ 062023 │
│ 1/8/2023 ┆ 17.0 ┆ 2023-08-01 ┆ 082023 ┆ 072023 │
│ 1/9/2023 ┆ 16.0 ┆ 2023-09-01 ┆ 092023 ┆ 082023 │
│ 1/10/2023 ┆ 18.0 ┆ 2023-10-01 ┆ 102023 ┆ 092023 │
└───────────┴───────┴────────────┴────────┴─────────┘
Export Polars Dataframe To CSV
There are other file formats you can export the DataFrame into, such as write_excel
and write_parquet
.
# df.write_csv('df.csv')
Read Multi-Sheet Excel File
# Read the first sheet.
# df = pl.read_excel('data2.xlsx')
# Read the second sheet called "Sheet2".
# df = pl.read_excel('data2.xlsx', sheet_name = 'Sheet2')
Reading Text Files
Read a text file with |
delimited columns.
# df = pl.read_csv('pipe_sep.txt', seperator = '|')