Most of these commands are derived with the help of ChatGPT or stackoverflow. However, it is easier to look for a command here rather than searching for the answers to these questions over and over again.
There might be a few Python examples here not directly related to Polars!
Read .
as missing value
import polars as pl
# df = pl.read_csv("your_file.csv", null_values=".", has_header = True)
Replace .
with missing value
df = pl.DataFrame({
"col1": [".", "AB456", "GK789", "."],
"col2": [10, 20, 30, 40],
"col3": [1, 2, 3, 4]
})
df = df.with_columns(pl.col(pl.String).replace({".": None}))
print(df)
shape: (4, 3)
┌───────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═══════╪══════╪══════╡
│ null ┆ 10 ┆ 1 │
│ AB456 ┆ 20 ┆ 2 │
│ GK789 ┆ 30 ┆ 3 │
│ null ┆ 40 ┆ 4 │
└───────┴──────┴──────┘
Get column names
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40],
"col3": [1, 2, 3, 4]
})
print(df.columns)
# Get the last two columns
print(df.columns[-2:])
# Get the first two columns
print(df.columns[:2])
['col1', 'col2', 'col3']
['col2', 'col3']
['col1', 'col2']
Change values in a column based on a condition in another column
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40]
})
# Replace values in 'col2' based on the condition that 'col1' starts with 'GK'
df = df.with_columns(
pl.when(pl.col("col1").str.starts_with("GK"))
.then(999) # Replace with your desired value
.otherwise(pl.col("col2")) # Keep original value if condition is not met
.alias("col2")
)
print(df)
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ GK123 ┆ 999 │
│ AB456 ┆ 20 │
│ GK789 ┆ 999 │
│ CD012 ┆ 40 │
└───────┴──────┘
Change values in a column based on multiple conditions in another column
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 25, 30, 40]
})
# Replace values in 'col2' based on multiple conditions
df = df.with_columns(
pl.when((pl.col("col1").str.starts_with("GK")) & (pl.col("col2") > 20))
.then(999) # Replace with your desired value
.otherwise(pl.col("col2")) # Keep original value if conditions are not met
.alias("col2")
)
print(df)
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 25, 30, 40]
})
df = df.with_columns(
pl.when(pl.col("col2") < 11).then(20)
.when(pl.col("col2") > 33).then(pl.col("col2") * 2)
.otherwise(pl.col("col2"))
.alias("col2")
)
print(df)
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ GK123 ┆ 10 │
│ AB456 ┆ 25 │
│ GK789 ┆ 999 │
│ CD012 ┆ 40 │
└───────┴──────┘
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ GK123 ┆ 20 │
│ AB456 ┆ 25 │
│ GK789 ┆ 30 │
│ CD012 ┆ 80 │
└───────┴──────┘
Extract a column into a vector
col1_vector = df["col1"].to_list()
print(col1_vector)
['GK123', 'AB456', 'GK789', 'CD012']
Extract values meeting a condition from a column
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2": [4, 5, 6]
})
# Extract values from 'col1' that are greater than 1
filtered_values = df.filter(pl.col("col1") > 1)["col1"].to_list()
print(filtered_values)
[2, 3]
Get the dimension of the DataFrame
df.shape
(3, 2)
Paste a text string to a range of numbers
# Generate the sequence from 1001 to 1010 and concatenate "DK" with each value
result = [f"DK{1000 + i}" for i in range(1, 11)]
print(result)
result = ["DK" + str(i) for i in range(1001, 1011)]
print(result)
['DK1001', 'DK1002', 'DK1003', 'DK1004', 'DK1005', 'DK1006', 'DK1007', 'DK1008', 'DK1009', 'DK1010']
['DK1001', 'DK1002', 'DK1003', 'DK1004', 'DK1005', 'DK1006', 'DK1007', 'DK1008', 'DK1009', 'DK1010']
Remove Initial characters in a string list
string_list = ["AB123", "CD456", "EF789", "GH012"]
# Remove the first two characters from each string
modified_list = [s[2:] for s in string_list]
print(modified_list)
['123', '456', '789', '012']
Copy a column into another when a condition is met
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40]
})
# Copy 'col2' to 'col1' where 'col1' starts with 'GK'
df = df.with_columns(
pl.when(pl.col("col1").str.starts_with("GK"))
.then(pl.col("col2")) # Copy 'col2' value to 'col1'
.otherwise(pl.col("col1")) # Keep original 'col1' value if condition is not met
.alias("col1")
)
print(df)
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ 10 ┆ 10 │
│ AB456 ┆ 20 │
│ 30 ┆ 30 │
│ CD012 ┆ 40 │
└───────┴──────┘
Copy a list into a DataFrame column when a condition is met
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40]
})
new_values = ["XY987", "ZW654", "XY987", "ZW654"]
# Copy 'new_values' to 'col1' where 'col1' starts with 'GK'
df = df.with_columns(
pl.when(pl.col("col1").str.starts_with("GK"))
.then(pl.Series(new_values)) # Assign new values with the same shape
.otherwise(pl.col("col1"))
.alias("col1")
)
print(df)
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ XY987 ┆ 10 │
│ AB456 ┆ 20 │
│ XY987 ┆ 30 │
│ CD012 ┆ 40 │
└───────┴──────┘
Copy a list into a column
df = df.with_columns(pl.Series("col1", new_values))
print(df)
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40]
})
df = df.with_columns(col1 = pl.lit(pl.Series(new_values)))
print(df)
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ XY987 ┆ 10 │
│ ZW654 ┆ 20 │
│ XY987 ┆ 30 │
│ ZW654 ┆ 40 │
└───────┴──────┘
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ XY987 ┆ 10 │
│ ZW654 ┆ 20 │
│ XY987 ┆ 30 │
│ ZW654 ┆ 40 │
└───────┴──────┘
Copy row-sums (rowSums
) of columns “a” and “b” into column “a”
df = pl.DataFrame({
"a": [1, 2, 3],
"b": [4, 5, 6]
})
# Compute the row-wise sum of columns "a" and "b" and store it in column "a"
df = df.with_columns(
(pl.col("a") + pl.col("b")).alias("a")
)
print(df)
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 5 ┆ 4 │
│ 7 ┆ 5 │
│ 9 ┆ 6 │
└─────┴─────┘
Drop columns with the column name ending with a pattern
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2_right": [4, 5, 6],
"col3": [7, 8, 9],
"col4_right": [10, 11, 12]
})
# Drop columns that end with "_right"
df = df.select([col for col in df.columns if not col.endswith("_right")])
print(df)
shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col3 │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪══════╡
│ 1 ┆ 7 │
│ 2 ┆ 8 │
│ 3 ┆ 9 │
└──────┴──────┘
Rename columns with the column name ending with a pattern
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2_right": [4, 5, 6],
"col3": [7, 8, 9],
"col4_right": [10, 11, 12]
})
# Rename columns ending with "_right" to "d*" (d1, d2, ...)
new_names = {}
counter = 1
for col in df.columns:
if col.endswith("_right"):
new_names[col] = f"d{counter}"
counter += 1
# Apply the renaming
df = df.rename(new_names)
print(df)
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2_right": [4, 5, 6],
"col3": [7, 8, 9],
"col4_right": [10, 11, 12]
})
# Create a dictionary to hold the renaming mappings
new_names = {}
# Iterate through the columns and rename those ending with "_right"
for col in df.columns:
if col.endswith("_right"):
# Extract the part before "_right" and prepend "d"
base_name = col.split("_right")[0]
new_names[col] = f"d{base_name}"
# Apply the renaming
df = df.rename(new_names)
print(df)
shape: (3, 4)
┌──────┬─────┬──────┬─────┐
│ col1 ┆ d1 ┆ col3 ┆ d2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪═════╪══════╪═════╡
│ 1 ┆ 4 ┆ 7 ┆ 10 │
│ 2 ┆ 5 ┆ 8 ┆ 11 │
│ 3 ┆ 6 ┆ 9 ┆ 12 │
└──────┴─────┴──────┴─────┘
shape: (3, 4)
┌──────┬───────┬──────┬───────┐
│ col1 ┆ dcol2 ┆ col3 ┆ dcol4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪═══════╪══════╪═══════╡
│ 1 ┆ 4 ┆ 7 ┆ 10 │
│ 2 ┆ 5 ┆ 8 ┆ 11 │
│ 3 ┆ 6 ┆ 9 ┆ 12 │
└──────┴───────┴──────┴───────┘
Rename columns based on column index
df = pl.DataFrame({
"original1": ["123-456", "789-012", "345-678"],
"original2": [1, 2, 3],
"original3": [10, 20, 30]
})
# Get the first two column names
old_names = df.columns[:2]
# New names for the first two columns
new_names = ["col1", "col2"]
# Rename the first two columns
df = df.rename({old_names[0]: new_names[0], old_names[1]: new_names[1]})
print(df)
shape: (3, 3)
┌─────────┬──────┬───────────┐
│ col1 ┆ col2 ┆ original3 │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════════╪══════╪═══════════╡
│ 123-456 ┆ 1 ┆ 10 │
│ 789-012 ┆ 2 ┆ 20 │
│ 345-678 ┆ 3 ┆ 30 │
└─────────┴──────┴───────────┘
Rename all columns with a list
df = pl.DataFrame({
"old_name1": [1, 2, 3],
"old_name2": [4, 5, 6],
"old_name3": [7, 8, 9]
})
# List of new column names
new_column_names = ["new_name1", "new_name2", "new_name3"]
# Rename all columns
df = df.rename({old: new for old, new in zip(df.columns, new_column_names)})
print(df)
shape: (3, 3)
┌───────────┬───────────┬───────────┐
│ new_name1 ┆ new_name2 ┆ new_name3 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═══════════╪═══════════╪═══════════╡
│ 1 ┆ 4 ┆ 7 │
│ 2 ┆ 5 ┆ 8 │
│ 3 ┆ 6 ┆ 9 │
└───────────┴───────────┴───────────┘
The difference between .str.replace
and .str.replace_all
df = pl.DataFrame({
"col1": ["123-456-34", "789-012-78", "345-678-02"],
"col2": [1, 2, 3]
})
# Replace "-" with "" in the "col1" column
df = df.with_columns(
pl.col("col1").str.replace("-", "").alias("col1")
)
print(df)
df = pl.DataFrame({
"col1": ["123-456", "789-012", "345-678"],
"col2": [1, 2, 3]
})
# Replace "-" with "" in the "col1" column
df = df.with_columns(
pl.col("col1").str.replace_all("-", "").alias("col1")
)
print(df)
shape: (3, 2)
┌───────────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════════╪══════╡
│ 123456-34 ┆ 1 │
│ 789012-78 ┆ 2 │
│ 345678-02 ┆ 3 │
└───────────┴──────┘
shape: (3, 2)
┌────────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════╪══════╡
│ 123456 ┆ 1 │
│ 789012 ┆ 2 │
│ 345678 ┆ 3 │
└────────┴──────┘
Read specific columns from a CSV file
# df = pl.read_csv("data.csv", columns=[0, 2])
Replace null with 0 in the last two columns
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2": [None, 5, None],
"col3": [7, None, 9]
})
# Replace null with 0 in the last two columns
df = df.with_columns(
[pl.col(df.columns[-2:]).fill_null(0)]
)
print(df)
shape: (3, 3)
┌──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╡
│ 1 ┆ 0 ┆ 7 │
│ 2 ┆ 5 ┆ 0 │
│ 3 ┆ 0 ┆ 9 │
└──────┴──────┴──────┘
Drop the first two characters from all elements in a DataFrame
df = pl.DataFrame({
"col1": ["ab123", "ab456", "ab789"],
"col2": ["cd001", "cd002", "cd003"]
})
# Drop the first two characters from all elements in the DataFrame
df = df.with_columns(
[pl.col(c).str.slice(2) for c in df.columns]
)
print(df)
# Now, do the same, but turn the resulting elements into integers.
df = pl.DataFrame({
"col1": ["ab123", "ab456", "ab789"],
"col2": ["cd001", "cd002", "cd003"]
})
# Drop the first two characters and convert the remaining strings to integers
df = df.with_columns(
[pl.col(c).str.slice(2).cast(pl.Int64) for c in df.columns]
)
print(df)
shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ str │
╞══════╪══════╡
│ 123 ┆ 001 │
│ 456 ┆ 002 │
│ 789 ┆ 003 │
└──────┴──────┘
shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪══════╡
│ 123 ┆ 1 │
│ 456 ┆ 2 │
│ 789 ┆ 3 │
└──────┴──────┘
Drop a column by its index number
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2": [4, 5, 6],
"col3": [7, 8, 9]
})
# Drop the first column by its index (0)
df = df.drop(df.columns[0])
print(df)
shape: (3, 2)
┌──────┬──────┐
│ col2 ┆ col3 │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪══════╡
│ 4 ┆ 7 │
│ 5 ┆ 8 │
│ 6 ┆ 9 │
└──────┴──────┘
Reorder columns
df.select(['col3', 'col2'])
shape: (3, 2)
col3 | col2 |
---|---|
i64 | i64 |
7 | 4 |
8 | 5 |
9 | 6 |
df.select([pl.col('col3'), pl.col('col2')])
shape: (3, 2)
col3 | col2 |
---|---|
i64 | i64 |
7 | 4 |
8 | 5 |
9 | 6 |
Reorder DataFrame’s rows by matching a column with a list
df = pl.DataFrame({
"col1": ["b", "c", "a", "e", "d"],
"col2": [1, 2, 3, 4, 5]
})
# The desired order for "col1"
desired_order = ["a", "b", "c", "d", "e"]
# Reorder the DataFrame by matching "col1" with the desired_order list
df_reordered = df.with_columns(
pl.col("col1").map_elements(lambda x: desired_order.index(x), return_dtype=pl.Int64).alias("sort_key")
).sort("sort_key").drop("sort_key")
print(df_reordered)
# or
df_reordered = df.join(pl.DataFrame({"col1": desired_order}), on = "col1")
print(df_reordered)
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞══════╪══════╡
│ a ┆ 3 │
│ b ┆ 1 │
│ c ┆ 2 │
│ d ┆ 5 │
│ e ┆ 4 │
└──────┴──────┘
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞══════╪══════╡
│ a ┆ 3 │
│ b ┆ 1 │
│ c ┆ 2 │
│ d ┆ 5 │
│ e ┆ 4 │
└──────┴──────┘
Filter the DataFrame for elements containing a pattern in a column
df = pl.DataFrame({
"col1": ["abc123", "def456", "ghi789", "abcxyz"],
"col2": [1, 2, 3, 4]
})
# Filter rows where "col1" contains the pattern "abc"
filtered_df = df.filter(pl.col("col1").str.contains("abc"))
print(filtered_df)
shape: (2, 2)
┌────────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════╪══════╡
│ abc123 ┆ 1 │
│ abcxyz ┆ 4 │
└────────┴──────┘
Compute the dot/inner product (crossproduct) between two Series
s = pl.Series("a", [1, 2, 3])
s2 = pl.Series("b", [4.0, 5.0, 6.0])
s.dot(s2)
32.0
Extract rows of a DataFrame in a list of tuples
print(df.rows())
[('abc123', 1), ('def456', 2), ('ghi789', 3), ('abcxyz', 4)]
Get rowSums
of a DataFrame
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2": [4, 5, 6],
"col3": [7, 8, 9]
})
df = df.with_columns(df.select(pl.sum_horizontal("*").alias("row_sum")))
df.with_columns(df.select(pl.sum_horizontal(["col1","col2"]).alias("row_sum")))
print(df)
shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 12 │
│ 2 ┆ 5 ┆ 8 ┆ 15 │
│ 3 ┆ 6 ┆ 9 ┆ 18 │
└──────┴──────┴──────┴─────────┘
head and tail
print(df.head())
print(df.head(2))
print(df.tail())
print(df.tail(2))
shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 12 │
│ 2 ┆ 5 ┆ 8 ┆ 15 │
│ 3 ┆ 6 ┆ 9 ┆ 18 │
└──────┴──────┴──────┴─────────┘
shape: (2, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 12 │
│ 2 ┆ 5 ┆ 8 ┆ 15 │
└──────┴──────┴──────┴─────────┘
shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 12 │
│ 2 ┆ 5 ┆ 8 ┆ 15 │
│ 3 ┆ 6 ┆ 9 ┆ 18 │
└──────┴──────┴──────┴─────────┘
shape: (2, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 2 ┆ 5 ┆ 8 ┆ 15 │
│ 3 ┆ 6 ┆ 9 ┆ 18 │
└──────┴──────┴──────┴─────────┘
Matrix-vector multiplication
import numpy as np
a = pl.DataFrame({
"col1": [5,1,3,2],
"col2": [1,1,1,3],
"col3": [1,2,1,4]
})
b = [1, 2, 3]
print(a.to_numpy().dot(b))
[10 9 8 20]