Polars Tutorial2

Most of these commands are derived with the help of ChatGPT or stackoverflow. However, it is easier to look for a command here rather than searching for the answers to these questions over and over again.

There might be a few Python examples here not directly related to Polars!

Read `.` as missing value

import polars as pl
# df = pl.read_csv("your_file.csv", null_values=".", has_header = True)

Replace `.` with missing value

df = pl.DataFrame({
    "col1": [".", "AB456", "GK789", "."],
    "col2": [10, 20, 30, 40],
    "col3": [1, 2, 3, 4]
})

df = df.with_columns(pl.col(pl.String).replace({".": None}))

print(df)

shape: (4, 3)
┌───────┬──────┬──────┐
│ col1  ┆ col2 ┆ col3 │
│ ---   ┆ ---  ┆ ---  │
│ str   ┆ i64  ┆ i64  │
╞═══════╪══════╪══════╡
│ null  ┆ 10   ┆ 1    │
│ AB456 ┆ 20   ┆ 2    │
│ GK789 ┆ 30   ┆ 3    │
│ null  ┆ 40   ┆ 4    │
└───────┴──────┴──────┘

Get column names

df = pl.DataFrame({
    "col1": ["GK123", "AB456", "GK789", "CD012"],
    "col2": [10, 20, 30, 40],
    "col3": [1, 2, 3, 4]
})

print(df.columns)

# Get the last two columns
print(df.columns[-2:])

# Get the first two columns
print(df.columns[:2])

['col1', 'col2', 'col3']
['col2', 'col3']
['col1', 'col2']

Change values in a column based on a condition in another column

df = pl.DataFrame({
    "col1": ["GK123", "AB456", "GK789", "CD012"],
    "col2": [10, 20, 30, 40]
})

# Replace values in 'col2' based on the condition that 'col1' starts with 'GK'
df = df.with_columns(
    pl.when(pl.col("col1").str.starts_with("GK"))
    .then(999)  # Replace with your desired value
    .otherwise(pl.col("col2"))  # Keep original value if condition is not met
    .alias("col2")
)

print(df)

shape: (4, 2)
┌───────┬──────┐
│ col1  ┆ col2 │
│ ---   ┆ ---  │
│ str   ┆ i64  │
╞═══════╪══════╡
│ GK123 ┆ 999  │
│ AB456 ┆ 20   │
│ GK789 ┆ 999  │
│ CD012 ┆ 40   │
└───────┴──────┘

Change values in a column based on multiple conditions in another column

df = pl.DataFrame({
    "col1": ["GK123", "AB456", "GK789", "CD012"],
    "col2": [10, 25, 30, 40]
})

# Replace values in 'col2' based on multiple conditions
df = df.with_columns(
    pl.when((pl.col("col1").str.starts_with("GK")) & (pl.col("col2") > 20))
    .then(999)  # Replace with your desired value
    .otherwise(pl.col("col2"))  # Keep original value if conditions are not met
    .alias("col2")
)

print(df)

df = pl.DataFrame({
    "col1": ["GK123", "AB456", "GK789", "CD012"],
    "col2": [10, 25, 30, 40]
})

df = df.with_columns(
    pl.when(pl.col("col2") < 11).then(20)
    .when(pl.col("col2") > 33).then(pl.col("col2") * 2)
    .otherwise(pl.col("col2"))
    .alias("col2")
)

print(df)

shape: (4, 2)
┌───────┬──────┐
│ col1  ┆ col2 │
│ ---   ┆ ---  │
│ str   ┆ i64  │
╞═══════╪══════╡
│ GK123 ┆ 10   │
│ AB456 ┆ 25   │
│ GK789 ┆ 999  │
│ CD012 ┆ 40   │
└───────┴──────┘
shape: (4, 2)
┌───────┬──────┐
│ col1  ┆ col2 │
│ ---   ┆ ---  │
│ str   ┆ i64  │
╞═══════╪══════╡
│ GK123 ┆ 20   │
│ AB456 ┆ 25   │
│ GK789 ┆ 30   │
│ CD012 ┆ 80   │
└───────┴──────┘

Extract a column into a vector

col1_vector = df["col1"].to_list()

print(col1_vector)

['GK123', 'AB456', 'GK789', 'CD012']

Extract values meeting a condition from a column

df = pl.DataFrame({
    "col1": [1, 2, 3],
    "col2": [4, 5, 6]
})

# Extract values from 'col1' that are greater than 1
filtered_values = df.filter(pl.col("col1") > 1)["col1"].to_list()

print(filtered_values)

[2, 3]

Get the dimension of the DataFrame

df.shape

(3, 2)

Paste a text string to a range of numbers

# Generate the sequence from 1001 to 1010 and concatenate "DK" with each value
result = [f"DK{1000 + i}" for i in range(1, 11)]

print(result)

result = ["DK" + str(i) for i in range(1001, 1011)]

print(result)

['DK1001', 'DK1002', 'DK1003', 'DK1004', 'DK1005', 'DK1006', 'DK1007', 'DK1008', 'DK1009', 'DK1010']
['DK1001', 'DK1002', 'DK1003', 'DK1004', 'DK1005', 'DK1006', 'DK1007', 'DK1008', 'DK1009', 'DK1010']

Remove Initial characters in a string list

string_list = ["AB123", "CD456", "EF789", "GH012"]

# Remove the first two characters from each string
modified_list = [s[2:] for s in string_list]

print(modified_list)

['123', '456', '789', '012']

Copy a column into another when a condition is met

df = pl.DataFrame({
    "col1": ["GK123", "AB456", "GK789", "CD012"],
    "col2": [10, 20, 30, 40]
})

# Copy 'col2' to 'col1' where 'col1' starts with 'GK'
df = df.with_columns(
    pl.when(pl.col("col1").str.starts_with("GK"))
    .then(pl.col("col2"))  # Copy 'col2' value to 'col1'
    .otherwise(pl.col("col1"))  # Keep original 'col1' value if condition is not met
    .alias("col1")
)

print(df)

shape: (4, 2)
┌───────┬──────┐
│ col1  ┆ col2 │
│ ---   ┆ ---  │
│ str   ┆ i64  │
╞═══════╪══════╡
│ 10    ┆ 10   │
│ AB456 ┆ 20   │
│ 30    ┆ 30   │
│ CD012 ┆ 40   │
└───────┴──────┘

Copy a list into a DataFrame column when a condition is met

df = pl.DataFrame({
    "col1": ["GK123", "AB456", "GK789", "CD012"],
    "col2": [10, 20, 30, 40]
})

new_values = ["XY987", "ZW654", "XY987", "ZW654"]

# Copy 'new_values' to 'col1' where 'col1' starts with 'GK'
df = df.with_columns(
    pl.when(pl.col("col1").str.starts_with("GK"))
    .then(pl.Series(new_values))  # Assign new values with the same shape
    .otherwise(pl.col("col1"))
    .alias("col1")
)

print(df)

shape: (4, 2)
┌───────┬──────┐
│ col1  ┆ col2 │
│ ---   ┆ ---  │
│ str   ┆ i64  │
╞═══════╪══════╡
│ XY987 ┆ 10   │
│ AB456 ┆ 20   │
│ XY987 ┆ 30   │
│ CD012 ┆ 40   │
└───────┴──────┘

Copy a list into a column

df = df.with_columns(pl.Series("col1", new_values))

print(df)

df = pl.DataFrame({
    "col1": ["GK123", "AB456", "GK789", "CD012"],
    "col2": [10, 20, 30, 40]
})

df = df.with_columns(col1 = pl.lit(pl.Series(new_values)))

print(df)

shape: (4, 2)
┌───────┬──────┐
│ col1  ┆ col2 │
│ ---   ┆ ---  │
│ str   ┆ i64  │
╞═══════╪══════╡
│ XY987 ┆ 10   │
│ ZW654 ┆ 20   │
│ XY987 ┆ 30   │
│ ZW654 ┆ 40   │
└───────┴──────┘
shape: (4, 2)
┌───────┬──────┐
│ col1  ┆ col2 │
│ ---   ┆ ---  │
│ str   ┆ i64  │
╞═══════╪══════╡
│ XY987 ┆ 10   │
│ ZW654 ┆ 20   │
│ XY987 ┆ 30   │
│ ZW654 ┆ 40   │
└───────┴──────┘

Copy row-sums (`rowSums`) of columns “a” and “b” into column “a”

df = pl.DataFrame({
    "a": [1, 2, 3],
    "b": [4, 5, 6]
})

# Compute the row-wise sum of columns "a" and "b" and store it in column "a"
df = df.with_columns(
    (pl.col("a") + pl.col("b")).alias("a")
)

print(df)

shape: (3, 2)
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 5   ┆ 4   │
│ 7   ┆ 5   │
│ 9   ┆ 6   │
└─────┴─────┘

Drop columns with the column name ending with a pattern

df = pl.DataFrame({
    "col1": [1, 2, 3],
    "col2_right": [4, 5, 6],
    "col3": [7, 8, 9],
    "col4_right": [10, 11, 12]
})

# Drop columns that end with "_right"
df = df.select([col for col in df.columns if not col.endswith("_right")])

print(df)

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col3 │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 1    ┆ 7    │
│ 2    ┆ 8    │
│ 3    ┆ 9    │
└──────┴──────┘

Rename columns with the column name ending with a pattern

df = pl.DataFrame({
    "col1": [1, 2, 3],
    "col2_right": [4, 5, 6],
    "col3": [7, 8, 9],
    "col4_right": [10, 11, 12]
})

# Rename columns ending with "_right" to "d*" (d1, d2, ...)
new_names = {}
counter = 1

for col in df.columns:
    if col.endswith("_right"):
        new_names[col] = f"d{counter}"
        counter += 1

# Apply the renaming
df = df.rename(new_names)

print(df)

df = pl.DataFrame({
    "col1": [1, 2, 3],
    "col2_right": [4, 5, 6],
    "col3": [7, 8, 9],
    "col4_right": [10, 11, 12]
})

# Create a dictionary to hold the renaming mappings
new_names = {}

# Iterate through the columns and rename those ending with "_right"
for col in df.columns:
    if col.endswith("_right"):
        # Extract the part before "_right" and prepend "d"
        base_name = col.split("_right")[0]
        new_names[col] = f"d{base_name}"

# Apply the renaming
df = df.rename(new_names)

print(df)

shape: (3, 4)
┌──────┬─────┬──────┬─────┐
│ col1 ┆ d1  ┆ col3 ┆ d2  │
│ ---  ┆ --- ┆ ---  ┆ --- │
│ i64  ┆ i64 ┆ i64  ┆ i64 │
╞══════╪═════╪══════╪═════╡
│ 1    ┆ 4   ┆ 7    ┆ 10  │
│ 2    ┆ 5   ┆ 8    ┆ 11  │
│ 3    ┆ 6   ┆ 9    ┆ 12  │
└──────┴─────┴──────┴─────┘
shape: (3, 4)
┌──────┬───────┬──────┬───────┐
│ col1 ┆ dcol2 ┆ col3 ┆ dcol4 │
│ ---  ┆ ---   ┆ ---  ┆ ---   │
│ i64  ┆ i64   ┆ i64  ┆ i64   │
╞══════╪═══════╪══════╪═══════╡
│ 1    ┆ 4     ┆ 7    ┆ 10    │
│ 2    ┆ 5     ┆ 8    ┆ 11    │
│ 3    ┆ 6     ┆ 9    ┆ 12    │
└──────┴───────┴──────┴───────┘

Rename columns based on column index

df = pl.DataFrame({
    "original1": ["123-456", "789-012", "345-678"],
    "original2": [1, 2, 3],
    "original3": [10, 20, 30]
})

# Get the first two column names
old_names = df.columns[:2]

# New names for the first two columns
new_names = ["col1", "col2"]

# Rename the first two columns
df = df.rename({old_names[0]: new_names[0], old_names[1]: new_names[1]})

print(df)

shape: (3, 3)
┌─────────┬──────┬───────────┐
│ col1    ┆ col2 ┆ original3 │
│ ---     ┆ ---  ┆ ---       │
│ str     ┆ i64  ┆ i64       │
╞═════════╪══════╪═══════════╡
│ 123-456 ┆ 1    ┆ 10        │
│ 789-012 ┆ 2    ┆ 20        │
│ 345-678 ┆ 3    ┆ 30        │
└─────────┴──────┴───────────┘

Rename all columns with a list

df = pl.DataFrame({
    "old_name1": [1, 2, 3],
    "old_name2": [4, 5, 6],
    "old_name3": [7, 8, 9]
})

# List of new column names
new_column_names = ["new_name1", "new_name2", "new_name3"]

# Rename all columns
df = df.rename({old: new for old, new in zip(df.columns, new_column_names)})

print(df)

shape: (3, 3)
┌───────────┬───────────┬───────────┐
│ new_name1 ┆ new_name2 ┆ new_name3 │
│ ---       ┆ ---       ┆ ---       │
│ i64       ┆ i64       ┆ i64       │
╞═══════════╪═══════════╪═══════════╡
│ 1         ┆ 4         ┆ 7         │
│ 2         ┆ 5         ┆ 8         │
│ 3         ┆ 6         ┆ 9         │
└───────────┴───────────┴───────────┘

The difference between `.str.replace` and `.str.replace_all`

df = pl.DataFrame({
    "col1": ["123-456-34", "789-012-78", "345-678-02"],
    "col2": [1, 2, 3]
})

# Replace "-" with "" in the "col1" column
df = df.with_columns(
    pl.col("col1").str.replace("-", "").alias("col1")
)

print(df)

df = pl.DataFrame({
    "col1": ["123-456", "789-012", "345-678"],
    "col2": [1, 2, 3]
})

# Replace "-" with "" in the "col1" column
df = df.with_columns(
    pl.col("col1").str.replace_all("-", "").alias("col1")
)

print(df)

shape: (3, 2)
┌───────────┬──────┐
│ col1      ┆ col2 │
│ ---       ┆ ---  │
│ str       ┆ i64  │
╞═══════════╪══════╡
│ 123456-34 ┆ 1    │
│ 789012-78 ┆ 2    │
│ 345678-02 ┆ 3    │
└───────────┴──────┘
shape: (3, 2)
┌────────┬──────┐
│ col1   ┆ col2 │
│ ---    ┆ ---  │
│ str    ┆ i64  │
╞════════╪══════╡
│ 123456 ┆ 1    │
│ 789012 ┆ 2    │
│ 345678 ┆ 3    │
└────────┴──────┘

Read specific columns from a CSV file

# df = pl.read_csv("data.csv", columns=[0, 2])

Replace null with 0 in the last two columns

df = pl.DataFrame({
    "col1": [1, 2, 3],
    "col2": [None, 5, None],
    "col3": [7, None, 9]
})

# Replace null with 0 in the last two columns
df = df.with_columns(
    [pl.col(df.columns[-2:]).fill_null(0)]
)

print(df)

shape: (3, 3)
┌──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ ---  ┆ ---  ┆ ---  │
│ i64  ┆ i64  ┆ i64  │
╞══════╪══════╪══════╡
│ 1    ┆ 0    ┆ 7    │
│ 2    ┆ 5    ┆ 0    │
│ 3    ┆ 0    ┆ 9    │
└──────┴──────┴──────┘

Drop the first two characters from all elements in a DataFrame

df = pl.DataFrame({
    "col1": ["ab123", "ab456", "ab789"],
    "col2": ["cd001", "cd002", "cd003"]
})

# Drop the first two characters from all elements in the DataFrame
df = df.with_columns(
    [pl.col(c).str.slice(2) for c in df.columns]
)

print(df)

# Now, do the same, but turn the resulting elements into integers.

df = pl.DataFrame({
    "col1": ["ab123", "ab456", "ab789"],
    "col2": ["cd001", "cd002", "cd003"]
})

# Drop the first two characters and convert the remaining strings to integers
df = df.with_columns(
    [pl.col(c).str.slice(2).cast(pl.Int64) for c in df.columns]
)

print(df)

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ 123  ┆ 001  │
│ 456  ┆ 002  │
│ 789  ┆ 003  │
└──────┴──────┘
shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 123  ┆ 1    │
│ 456  ┆ 2    │
│ 789  ┆ 3    │
└──────┴──────┘

Drop a column by its index number

df = pl.DataFrame({
    "col1": [1, 2, 3],
    "col2": [4, 5, 6],
    "col3": [7, 8, 9]
})

# Drop the first column by its index (0)
df = df.drop(df.columns[0])

print(df)

shape: (3, 2)
┌──────┬──────┐
│ col2 ┆ col3 │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 4    ┆ 7    │
│ 5    ┆ 8    │
│ 6    ┆ 9    │
└──────┴──────┘

Reorder columns

df.select(['col3', 'col2'])

shape: (3, 2)

col3	col2
i64	i64
7	4
8	5
9	6

df.select([pl.col('col3'), pl.col('col2')])

shape: (3, 2)

col3	col2
i64	i64
7	4
8	5
9	6

Reorder DataFrame’s rows by matching a column with a list

df = pl.DataFrame({
    "col1": ["b", "c", "a", "e", "d"],
    "col2": [1, 2, 3, 4, 5]
})

# The desired order for "col1"
desired_order = ["a", "b", "c", "d", "e"]

# Reorder the DataFrame by matching "col1" with the desired_order list
df_reordered = df.with_columns(
    pl.col("col1").map_elements(lambda x: desired_order.index(x), return_dtype=pl.Int64).alias("sort_key")
).sort("sort_key").drop("sort_key")

print(df_reordered)

# or

df_reordered = df.join(pl.DataFrame({"col1": desired_order}), on = "col1")

print(df_reordered)

shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ i64  │
╞══════╪══════╡
│ a    ┆ 3    │
│ b    ┆ 1    │
│ c    ┆ 2    │
│ d    ┆ 5    │
│ e    ┆ 4    │
└──────┴──────┘
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ i64  │
╞══════╪══════╡
│ a    ┆ 3    │
│ b    ┆ 1    │
│ c    ┆ 2    │
│ d    ┆ 5    │
│ e    ┆ 4    │
└──────┴──────┘

Filter the DataFrame for elements containing a pattern in a column

df = pl.DataFrame({
    "col1": ["abc123", "def456", "ghi789", "abcxyz"],
    "col2": [1, 2, 3, 4]
})

# Filter rows where "col1" contains the pattern "abc"
filtered_df = df.filter(pl.col("col1").str.contains("abc"))

print(filtered_df)

shape: (2, 2)
┌────────┬──────┐
│ col1   ┆ col2 │
│ ---    ┆ ---  │
│ str    ┆ i64  │
╞════════╪══════╡
│ abc123 ┆ 1    │
│ abcxyz ┆ 4    │
└────────┴──────┘

Compute the dot/inner product (crossproduct) between two Series

s = pl.Series("a", [1, 2, 3])
s2 = pl.Series("b", [4.0, 5.0, 6.0])
s.dot(s2)

32.0

Extract rows of a DataFrame in a list of tuples

print(df.rows())

[('abc123', 1), ('def456', 2), ('ghi789', 3), ('abcxyz', 4)]

Get `rowSums` of a DataFrame

df = pl.DataFrame({
    "col1": [1, 2, 3],
    "col2": [4, 5, 6],
    "col3": [7, 8, 9]
})

df = df.with_columns(df.select(pl.sum_horizontal("*").alias("row_sum")))
df.with_columns(df.select(pl.sum_horizontal(["col1","col2"]).alias("row_sum")))

print(df)

shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ ---  ┆ ---  ┆ ---  ┆ ---     │
│ i64  ┆ i64  ┆ i64  ┆ i64     │
╞══════╪══════╪══════╪═════════╡
│ 1    ┆ 4    ┆ 7    ┆ 12      │
│ 2    ┆ 5    ┆ 8    ┆ 15      │
│ 3    ┆ 6    ┆ 9    ┆ 18      │
└──────┴──────┴──────┴─────────┘

head and tail

print(df.head())
print(df.head(2))
print(df.tail())
print(df.tail(2))

shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ ---  ┆ ---  ┆ ---  ┆ ---     │
│ i64  ┆ i64  ┆ i64  ┆ i64     │
╞══════╪══════╪══════╪═════════╡
│ 1    ┆ 4    ┆ 7    ┆ 12      │
│ 2    ┆ 5    ┆ 8    ┆ 15      │
│ 3    ┆ 6    ┆ 9    ┆ 18      │
└──────┴──────┴──────┴─────────┘
shape: (2, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ ---  ┆ ---  ┆ ---  ┆ ---     │
│ i64  ┆ i64  ┆ i64  ┆ i64     │
╞══════╪══════╪══════╪═════════╡
│ 1    ┆ 4    ┆ 7    ┆ 12      │
│ 2    ┆ 5    ┆ 8    ┆ 15      │
└──────┴──────┴──────┴─────────┘
shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ ---  ┆ ---  ┆ ---  ┆ ---     │
│ i64  ┆ i64  ┆ i64  ┆ i64     │
╞══════╪══════╪══════╪═════════╡
│ 1    ┆ 4    ┆ 7    ┆ 12      │
│ 2    ┆ 5    ┆ 8    ┆ 15      │
│ 3    ┆ 6    ┆ 9    ┆ 18      │
└──────┴──────┴──────┴─────────┘
shape: (2, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ ---  ┆ ---  ┆ ---  ┆ ---     │
│ i64  ┆ i64  ┆ i64  ┆ i64     │
╞══════╪══════╪══════╪═════════╡
│ 2    ┆ 5    ┆ 8    ┆ 15      │
│ 3    ┆ 6    ┆ 9    ┆ 18      │
└──────┴──────┴──────┴─────────┘

Matrix-vector multiplication

import numpy as np

a = pl.DataFrame({
    "col1": [5,1,3,2],
    "col2": [1,1,1,3],
    "col3": [1,2,1,4]
})

b = [1, 2, 3]

print(a.to_numpy().dot(b))

[10  9  8 20]

Read . as missing value#

Replace . with missing value#

Get column names#

Change values in a column based on a condition in another column#

Change values in a column based on multiple conditions in another column#

Extract a column into a vector#

Extract values meeting a condition from a column#

Get the dimension of the DataFrame#

Paste a text string to a range of numbers#

Remove Initial characters in a string list#

Copy a column into another when a condition is met#

Copy a list into a DataFrame column when a condition is met#

Copy a list into a column#

Copy row-sums (rowSums) of columns “a” and “b” into column “a”#

Drop columns with the column name ending with a pattern#

Rename columns with the column name ending with a pattern#

Rename columns based on column index#

Rename all columns with a list#

The difference between .str.replace and .str.replace_all#

Read specific columns from a CSV file#

Replace null with 0 in the last two columns#

Drop the first two characters from all elements in a DataFrame#

Drop a column by its index number#

Reorder columns#

Reorder DataFrame’s rows by matching a column with a list#

Filter the DataFrame for elements containing a pattern in a column#

Compute the dot/inner product (crossproduct) between two Series#

Extract rows of a DataFrame in a list of tuples#

Get rowSums of a DataFrame#

head and tail#

Matrix-vector multiplication#