Most of these commands are derived with the help of ChatGPT or stackoverflow. However, it is easier to look for a command here rather than searching for the answers to these questions over and over again.
There might be a few Python examples here not directly related to Polars!
Read . as missing value import polars as pl # df = pl.read_csv("your_file.csv", null_values=".", has_header = True) Replace . with missing value df = pl.DataFrame({ "col1": [".", "AB456", "GK789", "."], "col2": [10, 20, 30, 40], "col3": [1, 2, 3, 4] }) df = df.with_columns(pl.col(pl.String).replace({".": None})) print(df) shape: (4, 3)
┌───────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═══════╪══════╪══════╡
│ null ┆ 10 ┆ 1 │
│ AB456 ┆ 20 ┆ 2 │
│ GK789 ┆ 30 ┆ 3 │
│ null ┆ 40 ┆ 4 │
└───────┴──────┴──────┘ Get column names df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40], "col3": [1, 2, 3, 4] }) print(df.columns) # Get the last two columns print(df.columns[-2:]) # Get the first two columns print(df.columns[:2]) ['col1', 'col2', 'col3']
['col2', 'col3']
['col1', 'col2'] Change values in a column based on a condition in another column df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40] }) # Replace values in 'col2' based on the condition that 'col1' starts with 'GK' df = df.with_columns( pl.when(pl.col("col1").str.starts_with("GK")) .then(999) # Replace with your desired value .otherwise(pl.col("col2")) # Keep original value if condition is not met .alias("col2") ) print(df) shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ GK123 ┆ 999 │
│ AB456 ┆ 20 │
│ GK789 ┆ 999 │
│ CD012 ┆ 40 │
└───────┴──────┘ Change values in a column based on multiple conditions in another column df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 25, 30, 40] }) # Replace values in 'col2' based on multiple conditions df = df.with_columns( pl.when((pl.col("col1").str.starts_with("GK")) & (pl.col("col2") > 20)) .then(999) # Replace with your desired value .otherwise(pl.col("col2")) # Keep original value if conditions are not met .alias("col2") ) print(df) df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 25, 30, 40] }) df = df.with_columns( pl.when(pl.col("col2") < 11).then(20) .when(pl.col("col2") > 33).then(pl.col("col2") * 2) .otherwise(pl.col("col2")) .alias("col2") ) print(df) shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ GK123 ┆ 10 │
│ AB456 ┆ 25 │
│ GK789 ┆ 999 │
│ CD012 ┆ 40 │
└───────┴──────┘
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ GK123 ┆ 20 │
│ AB456 ┆ 25 │
│ GK789 ┆ 30 │
│ CD012 ┆ 80 │
└───────┴──────┘ Extract a column into a vector col1_vector = df["col1"].to_list() print(col1_vector) ['GK123', 'AB456', 'GK789', 'CD012'] Extract values meeting a condition from a column df = pl.DataFrame({ "col1": [1, 2, 3], "col2": [4, 5, 6] }) # Extract values from 'col1' that are greater than 1 filtered_values = df.filter(pl.col("col1") > 1)["col1"].to_list() print(filtered_values) [2, 3] Get the dimension of the DataFrame df.shape (3, 2) Paste a text string to a range of numbers # Generate the sequence from 1001 to 1010 and concatenate "DK" with each value result = [f"DK{1000 + i}" for i in range(1, 11)] print(result) result = ["DK" + str(i) for i in range(1001, 1011)] print(result) ['DK1001', 'DK1002', 'DK1003', 'DK1004', 'DK1005', 'DK1006', 'DK1007', 'DK1008', 'DK1009', 'DK1010']
['DK1001', 'DK1002', 'DK1003', 'DK1004', 'DK1005', 'DK1006', 'DK1007', 'DK1008', 'DK1009', 'DK1010'] Remove Initial characters in a string list string_list = ["AB123", "CD456", "EF789", "GH012"] # Remove the first two characters from each string modified_list = [s[2:] for s in string_list] print(modified_list) ['123', '456', '789', '012'] Copy a column into another when a condition is met df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40] }) # Copy 'col2' to 'col1' where 'col1' starts with 'GK' df = df.with_columns( pl.when(pl.col("col1").str.starts_with("GK")) .then(pl.col("col2")) # Copy 'col2' value to 'col1' .otherwise(pl.col("col1")) # Keep original 'col1' value if condition is not met .alias("col1") ) print(df) shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ 10 ┆ 10 │
│ AB456 ┆ 20 │
│ 30 ┆ 30 │
│ CD012 ┆ 40 │
└───────┴──────┘ Copy a list into a DataFrame column when a condition is met df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40] }) new_values = ["XY987", "ZW654", "XY987", "ZW654"] # Copy 'new_values' to 'col1' where 'col1' starts with 'GK' df = df.with_columns( pl.when(pl.col("col1").str.starts_with("GK")) .then(pl.Series(new_values)) # Assign new values with the same shape .otherwise(pl.col("col1")) .alias("col1") ) print(df) shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ XY987 ┆ 10 │
│ AB456 ┆ 20 │
│ XY987 ┆ 30 │
│ CD012 ┆ 40 │
└───────┴──────┘ Copy a list into a column df = df.with_columns(pl.Series("col1", new_values)) print(df) df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40] }) df = df.with_columns(col1 = pl.lit(pl.Series(new_values))) print(df) shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ XY987 ┆ 10 │
│ ZW654 ┆ 20 │
│ XY987 ┆ 30 │
│ ZW654 ┆ 40 │
└───────┴──────┘
shape: (4, 2)
┌───────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════╡
│ XY987 ┆ 10 │
│ ZW654 ┆ 20 │
│ XY987 ┆ 30 │
│ ZW654 ┆ 40 │
└───────┴──────┘ Copy row-sums (rowSums) of columns “a” and “b” into column “a” df = pl.DataFrame({ "a": [1, 2, 3], "b": [4, 5, 6] }) # Compute the row-wise sum of columns "a" and "b" and store it in column "a" df = df.with_columns( (pl.col("a") + pl.col("b")).alias("a") ) print(df) shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 5 ┆ 4 │
│ 7 ┆ 5 │
│ 9 ┆ 6 │
└─────┴─────┘ Drop columns with the column name ending with a pattern df = pl.DataFrame({ "col1": [1, 2, 3], "col2_right": [4, 5, 6], "col3": [7, 8, 9], "col4_right": [10, 11, 12] }) # Drop columns that end with "_right" df = df.select([col for col in df.columns if not col.endswith("_right")]) print(df) shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col3 │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪══════╡
│ 1 ┆ 7 │
│ 2 ┆ 8 │
│ 3 ┆ 9 │
└──────┴──────┘ Rename columns with the column name ending with a pattern df = pl.DataFrame({ "col1": [1, 2, 3], "col2_right": [4, 5, 6], "col3": [7, 8, 9], "col4_right": [10, 11, 12] }) # Rename columns ending with "_right" to "d*" (d1, d2, ...) new_names = {} counter = 1 for col in df.columns: if col.endswith("_right"): new_names[col] = f"d{counter}" counter += 1 # Apply the renaming df = df.rename(new_names) print(df) df = pl.DataFrame({ "col1": [1, 2, 3], "col2_right": [4, 5, 6], "col3": [7, 8, 9], "col4_right": [10, 11, 12] }) # Create a dictionary to hold the renaming mappings new_names = {} # Iterate through the columns and rename those ending with "_right" for col in df.columns: if col.endswith("_right"): # Extract the part before "_right" and prepend "d" base_name = col.split("_right")[0] new_names[col] = f"d{base_name}" # Apply the renaming df = df.rename(new_names) print(df) shape: (3, 4)
┌──────┬─────┬──────┬─────┐
│ col1 ┆ d1 ┆ col3 ┆ d2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪═════╪══════╪═════╡
│ 1 ┆ 4 ┆ 7 ┆ 10 │
│ 2 ┆ 5 ┆ 8 ┆ 11 │
│ 3 ┆ 6 ┆ 9 ┆ 12 │
└──────┴─────┴──────┴─────┘
shape: (3, 4)
┌──────┬───────┬──────┬───────┐
│ col1 ┆ dcol2 ┆ col3 ┆ dcol4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪═══════╪══════╪═══════╡
│ 1 ┆ 4 ┆ 7 ┆ 10 │
│ 2 ┆ 5 ┆ 8 ┆ 11 │
│ 3 ┆ 6 ┆ 9 ┆ 12 │
└──────┴───────┴──────┴───────┘ Rename columns based on column index df = pl.DataFrame({ "original1": ["123-456", "789-012", "345-678"], "original2": [1, 2, 3], "original3": [10, 20, 30] }) # Get the first two column names old_names = df.columns[:2] # New names for the first two columns new_names = ["col1", "col2"] # Rename the first two columns df = df.rename({old_names[0]: new_names[0], old_names[1]: new_names[1]}) print(df) shape: (3, 3)
┌─────────┬──────┬───────────┐
│ col1 ┆ col2 ┆ original3 │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════════╪══════╪═══════════╡
│ 123-456 ┆ 1 ┆ 10 │
│ 789-012 ┆ 2 ┆ 20 │
│ 345-678 ┆ 3 ┆ 30 │
└─────────┴──────┴───────────┘ Rename all columns with a list df = pl.DataFrame({ "old_name1": [1, 2, 3], "old_name2": [4, 5, 6], "old_name3": [7, 8, 9] }) # List of new column names new_column_names = ["new_name1", "new_name2", "new_name3"] # Rename all columns df = df.rename({old: new for old, new in zip(df.columns, new_column_names)}) print(df) shape: (3, 3)
┌───────────┬───────────┬───────────┐
│ new_name1 ┆ new_name2 ┆ new_name3 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═══════════╪═══════════╪═══════════╡
│ 1 ┆ 4 ┆ 7 │
│ 2 ┆ 5 ┆ 8 │
│ 3 ┆ 6 ┆ 9 │
└───────────┴───────────┴───────────┘ The difference between .str.replace and .str.replace_all df = pl.DataFrame({ "col1": ["123-456-34", "789-012-78", "345-678-02"], "col2": [1, 2, 3] }) # Replace "-" with "" in the "col1" column df = df.with_columns( pl.col("col1").str.replace("-", "").alias("col1") ) print(df) df = pl.DataFrame({ "col1": ["123-456", "789-012", "345-678"], "col2": [1, 2, 3] }) # Replace "-" with "" in the "col1" column df = df.with_columns( pl.col("col1").str.replace_all("-", "").alias("col1") ) print(df) shape: (3, 2)
┌───────────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════════╪══════╡
│ 123456-34 ┆ 1 │
│ 789012-78 ┆ 2 │
│ 345678-02 ┆ 3 │
└───────────┴──────┘
shape: (3, 2)
┌────────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════╪══════╡
│ 123456 ┆ 1 │
│ 789012 ┆ 2 │
│ 345678 ┆ 3 │
└────────┴──────┘ Read specific columns from a CSV file # df = pl.read_csv("data.csv", columns=[0, 2]) Replace null with 0 in the last two columns df = pl.DataFrame({ "col1": [1, 2, 3], "col2": [None, 5, None], "col3": [7, None, 9] }) # Replace null with 0 in the last two columns df = df.with_columns( [pl.col(df.columns[-2:]).fill_null(0)] ) print(df) shape: (3, 3)
┌──────┬──────┬──────┐
│ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╡
│ 1 ┆ 0 ┆ 7 │
│ 2 ┆ 5 ┆ 0 │
│ 3 ┆ 0 ┆ 9 │
└──────┴──────┴──────┘ Drop the first two characters from all elements in a DataFrame df = pl.DataFrame({ "col1": ["ab123", "ab456", "ab789"], "col2": ["cd001", "cd002", "cd003"] }) # Drop the first two characters from all elements in the DataFrame df = df.with_columns( [pl.col(c).str.slice(2) for c in df.columns] ) print(df) # Now, do the same, but turn the resulting elements into integers. df = pl.DataFrame({ "col1": ["ab123", "ab456", "ab789"], "col2": ["cd001", "cd002", "cd003"] }) # Drop the first two characters and convert the remaining strings to integers df = df.with_columns( [pl.col(c).str.slice(2).cast(pl.Int64) for c in df.columns] ) print(df) shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ str │
╞══════╪══════╡
│ 123 ┆ 001 │
│ 456 ┆ 002 │
│ 789 ┆ 003 │
└──────┴──────┘
shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪══════╡
│ 123 ┆ 1 │
│ 456 ┆ 2 │
│ 789 ┆ 3 │
└──────┴──────┘ Drop a column by its index number df = pl.DataFrame({ "col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9] }) # Drop the first column by its index (0) df = df.drop(df.columns[0]) print(df) shape: (3, 2)
┌──────┬──────┐
│ col2 ┆ col3 │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪══════╡
│ 4 ┆ 7 │
│ 5 ┆ 8 │
│ 6 ┆ 9 │
└──────┴──────┘ Reorder columns df.select(['col3', 'col2']) shape: (3, 2)col3col2i64i64748596
df.select([pl.col('col3'), pl.col('col2')]) shape: (3, 2)col3col2i64i64748596
Reorder DataFrame’s rows by matching a column with a list df = pl.DataFrame({ "col1": ["b", "c", "a", "e", "d"], "col2": [1, 2, 3, 4, 5] }) # The desired order for "col1" desired_order = ["a", "b", "c", "d", "e"] # Reorder the DataFrame by matching "col1" with the desired_order list df_reordered = df.with_columns( pl.col("col1").map_elements(lambda x: desired_order.index(x), return_dtype=pl.Int64).alias("sort_key") ).sort("sort_key").drop("sort_key") print(df_reordered) # or df_reordered = df.join(pl.DataFrame({"col1": desired_order}), on = "col1") print(df_reordered) shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞══════╪══════╡
│ a ┆ 3 │
│ b ┆ 1 │
│ c ┆ 2 │
│ d ┆ 5 │
│ e ┆ 4 │
└──────┴──────┘
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞══════╪══════╡
│ a ┆ 3 │
│ b ┆ 1 │
│ c ┆ 2 │
│ d ┆ 5 │
│ e ┆ 4 │
└──────┴──────┘ Filter the DataFrame for elements containing a pattern in a column df = pl.DataFrame({ "col1": ["abc123", "def456", "ghi789", "abcxyz"], "col2": [1, 2, 3, 4] }) # Filter rows where "col1" contains the pattern "abc" filtered_df = df.filter(pl.col("col1").str.contains("abc")) print(filtered_df) shape: (2, 2)
┌────────┬──────┐
│ col1 ┆ col2 │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════╪══════╡
│ abc123 ┆ 1 │
│ abcxyz ┆ 4 │
└────────┴──────┘ Compute the dot/inner product (crossproduct) between two Series s = pl.Series("a", [1, 2, 3]) s2 = pl.Series("b", [4.0, 5.0, 6.0]) s.dot(s2) 32.0 Extract rows of a DataFrame in a list of tuples print(df.rows()) [('abc123', 1), ('def456', 2), ('ghi789', 3), ('abcxyz', 4)] Get rowSums of a DataFrame df = pl.DataFrame({ "col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9] }) df = df.with_columns(df.select(pl.sum_horizontal("*").alias("row_sum"))) df.with_columns(df.select(pl.sum_horizontal(["col1","col2"]).alias("row_sum"))) print(df) shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 12 │
│ 2 ┆ 5 ┆ 8 ┆ 15 │
│ 3 ┆ 6 ┆ 9 ┆ 18 │
└──────┴──────┴──────┴─────────┘ head and tail print(df.head()) print(df.head(2)) print(df.tail()) print(df.tail(2)) shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 12 │
│ 2 ┆ 5 ┆ 8 ┆ 15 │
│ 3 ┆ 6 ┆ 9 ┆ 18 │
└──────┴──────┴──────┴─────────┘
shape: (2, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 12 │
│ 2 ┆ 5 ┆ 8 ┆ 15 │
└──────┴──────┴──────┴─────────┘
shape: (3, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 12 │
│ 2 ┆ 5 ┆ 8 ┆ 15 │
│ 3 ┆ 6 ┆ 9 ┆ 18 │
└──────┴──────┴──────┴─────────┘
shape: (2, 4)
┌──────┬──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ col3 ┆ row_sum │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═════════╡
│ 2 ┆ 5 ┆ 8 ┆ 15 │
│ 3 ┆ 6 ┆ 9 ┆ 18 │
└──────┴──────┴──────┴─────────┘ Matrix-vector multiplication import numpy as np a = pl.DataFrame({ "col1": [5,1,3,2], "col2": [1,1,1,3], "col3": [1,2,1,4] }) b = [1, 2, 3] print(a.to_numpy().dot(b)) [10 9 8 20]