Python

Most of these commands were derived with the help of ChatGPT or Stack Overflow. Collecting them here makes it easier to look up a command than to search for the answers to the same questions over and over again. There might be a few Python examples here that are not directly related to Polars! Read "." as a missing value import polars as pl # df = pl.read_csv("your_file.csv", null_values=".", has_header = True) Replace "." with a missing value df = pl.DataFrame({ "col1": [".", "AB456", "GK789", "."], "col2": [10, 20, 30, 40], "col3": [1, 2, 3, 4] }) df = df.with_columns(pl.col(pl.String).replace({".": None})) print(df) shape: (4, 3) ┌───────┬──────┬──────┐ │ col1 ┆ col2 ┆ col3 │ │ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ i64 │ ╞═══════╪══════╪══════╡ │ null ┆ 10 ┆ 1 │ │ AB456 ┆ 20 ┆ 2 │ │ GK789 ┆ 30 ┆ 3 │ │ null ┆ 40 ┆ 4 │ └───────┴──────┴──────┘ Get column names df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40], "col3": [1, 2, 3, 4] }) print(df.columns) # Get the last two columns print(df.columns[-2:]) # Get the first two columns print(df.columns[:2]) ['col1', 'col2', 'col3'] ['col2', 'col3'] ['col1', 'col2'] Change values in a column based on a condition in another column df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40] }) # Replace values in 'col2' based on the condition that 'col1' starts with 'GK' df = df.with_columns( pl.when(pl.col("col1").str.starts_with("GK")) .then(999) # Replace with your desired value .otherwise(pl.col("col2")) # Keep original value if condition is not met .alias("col2") ) print(df) shape: (4, 2) ┌───────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════╪══════╡ │ GK123 ┆ 999 │ │ AB456 ┆ 20 │ │ GK789 ┆ 999 │ │ CD012 ┆ 40 │ └───────┴──────┘ Change values in a column based on multiple conditions in another column df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 25, 30, 40] }) # Replace values in 'col2' based on multiple conditions df = 
df.with_columns( pl.when((pl.col("col1").str.starts_with("GK")) & (pl.col("col2") > 20)) .then(999) # Replace with your desired value .otherwise(pl.col("col2")) # Keep original value if conditions are not met .alias("col2") ) print(df) df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 25, 30, 40] }) df = df.with_columns( pl.when(pl.col("col2") < 11).then(20) .when(pl.col("col2") > 33).then(pl.col("col2") * 2) .otherwise(pl.col("col2")) .alias("col2") ) print(df) shape: (4, 2) ┌───────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════╪══════╡ │ GK123 ┆ 10 │ │ AB456 ┆ 25 │ │ GK789 ┆ 999 │ │ CD012 ┆ 40 │ └───────┴──────┘ shape: (4, 2) ┌───────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════╪══════╡ │ GK123 ┆ 20 │ │ AB456 ┆ 25 │ │ GK789 ┆ 30 │ │ CD012 ┆ 80 │ └───────┴──────┘ Extract a column into a vector col1_vector = df["col1"].to_list() print(col1_vector) ['GK123', 'AB456', 'GK789', 'CD012'] Extract values meeting a condition from a column df = pl.DataFrame({ "col1": [1, 2, 3], "col2": [4, 5, 6] }) # Extract values from 'col1' that are greater than 1 filtered_values = df.filter(pl.col("col1") > 1)["col1"].to_list() print(filtered_values) [2, 3] Get the dimension of the DataFrame df.shape (3, 2) Paste a text string to a range of numbers # Generate the sequence from 1001 to 1010 and concatenate "DK" with each value result = [f"DK{1000 + i}" for i in range(1, 11)] print(result) result = ["DK" + str(i) for i in range(1001, 1011)] print(result) ['DK1001', 'DK1002', 'DK1003', 'DK1004', 'DK1005', 'DK1006', 'DK1007', 'DK1008', 'DK1009', 'DK1010'] ['DK1001', 'DK1002', 'DK1003', 'DK1004', 'DK1005', 'DK1006', 'DK1007', 'DK1008', 'DK1009', 'DK1010'] Remove Initial characters in a string list string_list = ["AB123", "CD456", "EF789", "GH012"] # Remove the first two characters from each string modified_list = [s[2:] for s in string_list] print(modified_list) ['123', '456', '789', '012'] Copy a column into another when a 
condition is met df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40] }) # Copy 'col2' to 'col1' where 'col1' starts with 'GK' df = df.with_columns( pl.when(pl.col("col1").str.starts_with("GK")) .then(pl.col("col2")) # Copy 'col2' value to 'col1' .otherwise(pl.col("col1")) # Keep original 'col1' value if condition is not met .alias("col1") ) print(df) shape: (4, 2) ┌───────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════╪══════╡ │ 10 ┆ 10 │ │ AB456 ┆ 20 │ │ 30 ┆ 30 │ │ CD012 ┆ 40 │ └───────┴──────┘ Copy a list into a DataFrame column when a condition is met df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40] }) new_values = ["XY987", "ZW654", "XY987", "ZW654"] # Copy 'new_values' to 'col1' where 'col1' starts with 'GK' df = df.with_columns( pl.when(pl.col("col1").str.starts_with("GK")) .then(pl.Series(new_values)) # Assign new values with the same shape .otherwise(pl.col("col1")) .alias("col1") ) print(df) shape: (4, 2) ┌───────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════╪══════╡ │ XY987 ┆ 10 │ │ AB456 ┆ 20 │ │ XY987 ┆ 30 │ │ CD012 ┆ 40 │ └───────┴──────┘ Copy a list into a column df = df.with_columns(pl.Series("col1", new_values)) print(df) df = pl.DataFrame({ "col1": ["GK123", "AB456", "GK789", "CD012"], "col2": [10, 20, 30, 40] }) df = df.with_columns(col1 = pl.lit(pl.Series(new_values))) print(df) shape: (4, 2) ┌───────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════╪══════╡ │ XY987 ┆ 10 │ │ ZW654 ┆ 20 │ │ XY987 ┆ 30 │ │ ZW654 ┆ 40 │ └───────┴──────┘ shape: (4, 2) ┌───────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════╪══════╡ │ XY987 ┆ 10 │ │ ZW654 ┆ 20 │ │ XY987 ┆ 30 │ │ ZW654 ┆ 40 │ └───────┴──────┘ Copy row-sums (rowSums) of columns “a” and “b” into column “a” df = pl.DataFrame({ "a": [1, 2, 3], "b": [4, 5, 6] }) # Compute the row-wise sum of columns "a" and "b" and store it in column "a" df = df.with_columns( (pl.col("a") + 
pl.col("b")).alias("a") ) print(df) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ │ 5 ┆ 4 │ │ 7 ┆ 5 │ │ 9 ┆ 6 │ └─────┴─────┘ Drop columns with the column name ending with a pattern df = pl.DataFrame({ "col1": [1, 2, 3], "col2_right": [4, 5, 6], "col3": [7, 8, 9], "col4_right": [10, 11, 12] }) # Drop columns that end with "_right" df = df.select([col for col in df.columns if not col.endswith("_right")]) print(df) shape: (3, 2) ┌──────┬──────┐ │ col1 ┆ col3 │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪══════╡ │ 1 ┆ 7 │ │ 2 ┆ 8 │ │ 3 ┆ 9 │ └──────┴──────┘ Rename columns with the column name ending with a pattern df = pl.DataFrame({ "col1": [1, 2, 3], "col2_right": [4, 5, 6], "col3": [7, 8, 9], "col4_right": [10, 11, 12] }) # Rename columns ending with "_right" to "d*" (d1, d2, ...) new_names = {} counter = 1 for col in df.columns: if col.endswith("_right"): new_names[col] = f"d{counter}" counter += 1 # Apply the renaming df = df.rename(new_names) print(df) df = pl.DataFrame({ "col1": [1, 2, 3], "col2_right": [4, 5, 6], "col3": [7, 8, 9], "col4_right": [10, 11, 12] }) # Create a dictionary to hold the renaming mappings new_names = {} # Iterate through the columns and rename those ending with "_right" for col in df.columns: if col.endswith("_right"): # Extract the part before "_right" and prepend "d" base_name = col.split("_right")[0] new_names[col] = f"d{base_name}" # Apply the renaming df = df.rename(new_names) print(df) shape: (3, 4) ┌──────┬─────┬──────┬─────┐ │ col1 ┆ d1 ┆ col3 ┆ d2 │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞══════╪═════╪══════╪═════╡ │ 1 ┆ 4 ┆ 7 ┆ 10 │ │ 2 ┆ 5 ┆ 8 ┆ 11 │ │ 3 ┆ 6 ┆ 9 ┆ 12 │ └──────┴─────┴──────┴─────┘ shape: (3, 4) ┌──────┬───────┬──────┬───────┐ │ col1 ┆ dcol2 ┆ col3 ┆ dcol4 │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞══════╪═══════╪══════╪═══════╡ │ 1 ┆ 4 ┆ 7 ┆ 10 │ │ 2 ┆ 5 ┆ 8 ┆ 11 │ │ 3 ┆ 6 ┆ 9 ┆ 12 │ └──────┴───────┴──────┴───────┘ Rename columns based on column index df = 
pl.DataFrame({ "original1": ["123-456", "789-012", "345-678"], "original2": [1, 2, 3], "original3": [10, 20, 30] }) # Get the first two column names old_names = df.columns[:2] # New names for the first two columns new_names = ["col1", "col2"] # Rename the first two columns df = df.rename({old_names[0]: new_names[0], old_names[1]: new_names[1]}) print(df) shape: (3, 3) ┌─────────┬──────┬───────────┐ │ col1 ┆ col2 ┆ original3 │ │ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ i64 │ ╞═════════╪══════╪═══════════╡ │ 123-456 ┆ 1 ┆ 10 │ │ 789-012 ┆ 2 ┆ 20 │ │ 345-678 ┆ 3 ┆ 30 │ └─────────┴──────┴───────────┘ Rename all columns with a list df = pl.DataFrame({ "old_name1": [1, 2, 3], "old_name2": [4, 5, 6], "old_name3": [7, 8, 9] }) # List of new column names new_column_names = ["new_name1", "new_name2", "new_name3"] # Rename all columns df = df.rename({old: new for old, new in zip(df.columns, new_column_names)}) print(df) shape: (3, 3) ┌───────────┬───────────┬───────────┐ │ new_name1 ┆ new_name2 ┆ new_name3 │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞═══════════╪═══════════╪═══════════╡ │ 1 ┆ 4 ┆ 7 │ │ 2 ┆ 5 ┆ 8 │ │ 3 ┆ 6 ┆ 9 │ └───────────┴───────────┴───────────┘ The difference between .str.replace and .str.replace_all df = pl.DataFrame({ "col1": ["123-456-34", "789-012-78", "345-678-02"], "col2": [1, 2, 3] }) # Replace "-" with "" in the "col1" column df = df.with_columns( pl.col("col1").str.replace("-", "").alias("col1") ) print(df) df = pl.DataFrame({ "col1": ["123-456", "789-012", "345-678"], "col2": [1, 2, 3] }) # Replace "-" with "" in the "col1" column df = df.with_columns( pl.col("col1").str.replace_all("-", "").alias("col1") ) print(df) shape: (3, 2) ┌───────────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞═══════════╪══════╡ │ 123456-34 ┆ 1 │ │ 789012-78 ┆ 2 │ │ 345678-02 ┆ 3 │ └───────────┴──────┘ shape: (3, 2) ┌────────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞════════╪══════╡ │ 123456 ┆ 1 │ │ 789012 ┆ 2 │ │ 345678 ┆ 3 │ └────────┴──────┘ Read specific 
columns from a CSV file # df = pl.read_csv("data.csv", columns=[0, 2]) Replace null with 0 in the last two columns df = pl.DataFrame({ "col1": [1, 2, 3], "col2": [None, 5, None], "col3": [7, None, 9] }) # Replace null with 0 in the last two columns df = df.with_columns( [pl.col(df.columns[-2:]).fill_null(0)] ) print(df) shape: (3, 3) ┌──────┬──────┬──────┐ │ col1 ┆ col2 ┆ col3 │ │ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 │ ╞══════╪══════╪══════╡ │ 1 ┆ 0 ┆ 7 │ │ 2 ┆ 5 ┆ 0 │ │ 3 ┆ 0 ┆ 9 │ └──────┴──────┴──────┘ Drop the first two characters from all elements in a DataFrame df = pl.DataFrame({ "col1": ["ab123", "ab456", "ab789"], "col2": ["cd001", "cd002", "cd003"] }) # Drop the first two characters from all elements in the DataFrame df = df.with_columns( [pl.col(c).str.slice(2) for c in df.columns] ) print(df) # Now, do the same, but turn the resulting elements into integers. df = pl.DataFrame({ "col1": ["ab123", "ab456", "ab789"], "col2": ["cd001", "cd002", "cd003"] }) # Drop the first two characters and convert the remaining strings to integers df = df.with_columns( [pl.col(c).str.slice(2).cast(pl.Int64) for c in df.columns] ) print(df) shape: (3, 2) ┌──────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ str │ ╞══════╪══════╡ │ 123 ┆ 001 │ │ 456 ┆ 002 │ │ 789 ┆ 003 │ └──────┴──────┘ shape: (3, 2) ┌──────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪══════╡ │ 123 ┆ 1 │ │ 456 ┆ 2 │ │ 789 ┆ 3 │ └──────┴──────┘ Drop a column by its index number df = pl.DataFrame({ "col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9] }) # Drop the first column by its index (0) df = df.drop(df.columns[0]) print(df) shape: (3, 2) ┌──────┬──────┐ │ col2 ┆ col3 │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪══════╡ │ 4 ┆ 7 │ │ 5 ┆ 8 │ │ 6 ┆ 9 │ └──────┴──────┘ Reorder columns df.select(['col3', 'col2']) shape: (3, 2) ┌──────┬──────┐ │ col3 ┆ col2 │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪══════╡ │ 7 ┆ 4 │ │ 8 ┆ 5 │ │ 9 ┆ 6 │ └──────┴──────┘ df.select([pl.col('col3'), pl.col('col2')]) shape: (3, 2) ┌──────┬──────┐ │ col3 ┆ col2 │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞══════╪══════╡ │ 7 ┆ 4 │ │ 8 ┆ 5 │ │ 9 ┆ 6 │ └──────┴──────┘ Reorder DataFrame’s rows by matching a column with a list df = 
pl.DataFrame({ "col1": ["b", "c", "a", "e", "d"], "col2": [1, 2, 3, 4, 5] }) # The desired order for "col1" desired_order = ["a", "b", "c", "d", "e"] # Reorder the DataFrame by matching "col1" with the desired_order list df_reordered = df.with_columns( pl.col("col1").map_elements(lambda x: desired_order.index(x), return_dtype=pl.Int64).alias("sort_key") ).sort("sort_key").drop("sort_key") print(df_reordered) # or df_reordered = df.join(pl.DataFrame({"col1": desired_order}), on = "col1") print(df_reordered) shape: (5, 2) ┌──────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞══════╪══════╡ │ a ┆ 3 │ │ b ┆ 1 │ │ c ┆ 2 │ │ d ┆ 5 │ │ e ┆ 4 │ └──────┴──────┘ shape: (5, 2) ┌──────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞══════╪══════╡ │ a ┆ 3 │ │ b ┆ 1 │ │ c ┆ 2 │ │ d ┆ 5 │ │ e ┆ 4 │ └──────┴──────┘ Filter the DataFrame for elements containing a pattern in a column df = pl.DataFrame({ "col1": ["abc123", "def456", "ghi789", "abcxyz"], "col2": [1, 2, 3, 4] }) # Filter rows where "col1" contains the pattern "abc" filtered_df = df.filter(pl.col("col1").str.contains("abc")) print(filtered_df) shape: (2, 2) ┌────────┬──────┐ │ col1 ┆ col2 │ │ --- ┆ --- │ │ str ┆ i64 │ ╞════════╪══════╡ │ abc123 ┆ 1 │ │ abcxyz ┆ 4 │ └────────┴──────┘ Compute the dot/inner product (crossproduct) between two Series s = pl.Series("a", [1, 2, 3]) s2 = pl.Series("b", [4.0, 5.0, 6.0]) s.dot(s2) 32.0 Extract rows of a DataFrame in a list of tuples print(df.rows()) [('abc123', 1), ('def456', 2), ('ghi789', 3), ('abcxyz', 4)] Get rowSums of a DataFrame df = pl.DataFrame({ "col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9] }) df = df.with_columns(df.select(pl.sum_horizontal("*").alias("row_sum"))) df.with_columns(df.select(pl.sum_horizontal(["col1","col2"]).alias("row_sum"))) print(df) shape: (3, 4) ┌──────┬──────┬──────┬─────────┐ │ col1 ┆ col2 ┆ col3 ┆ row_sum │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞══════╪══════╪══════╪═════════╡ │ 1 ┆ 4 ┆ 7 ┆ 12 │ │ 2 ┆ 5 ┆ 8 ┆ 
15 │ │ 3 ┆ 6 ┆ 9 ┆ 18 │ └──────┴──────┴──────┴─────────┘ head and tail print(df.head()) print(df.head(2)) print(df.tail()) print(df.tail(2)) shape: (3, 4) ┌──────┬──────┬──────┬─────────┐ │ col1 ┆ col2 ┆ col3 ┆ row_sum │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞══════╪══════╪══════╪═════════╡ │ 1 ┆ 4 ┆ 7 ┆ 12 │ │ 2 ┆ 5 ┆ 8 ┆ 15 │ │ 3 ┆ 6 ┆ 9 ┆ 18 │ └──────┴──────┴──────┴─────────┘ shape: (2, 4) ┌──────┬──────┬──────┬─────────┐ │ col1 ┆ col2 ┆ col3 ┆ row_sum │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞══════╪══════╪══════╪═════════╡ │ 1 ┆ 4 ┆ 7 ┆ 12 │ │ 2 ┆ 5 ┆ 8 ┆ 15 │ └──────┴──────┴──────┴─────────┘ shape: (3, 4) ┌──────┬──────┬──────┬─────────┐ │ col1 ┆ col2 ┆ col3 ┆ row_sum │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞══════╪══════╪══════╪═════════╡ │ 1 ┆ 4 ┆ 7 ┆ 12 │ │ 2 ┆ 5 ┆ 8 ┆ 15 │ │ 3 ┆ 6 ┆ 9 ┆ 18 │ └──────┴──────┴──────┴─────────┘ shape: (2, 4) ┌──────┬──────┬──────┬─────────┐ │ col1 ┆ col2 ┆ col3 ┆ row_sum │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞══════╪══════╪══════╪═════════╡ │ 2 ┆ 5 ┆ 8 ┆ 15 │ │ 3 ┆ 6 ┆ 9 ┆ 18 │ └──────┴──────┴──────┴─────────┘ Matrix-vector multiplication import numpy as np a = pl.DataFrame({ "col1": [5,1,3,2], "col2": [1,1,1,3], "col3": [1,2,1,4] }) b = [1, 2, 3] print(a.to_numpy().dot(b)) [10 9 8 20]

Python

Polars Tutorial

Polars Tutorial2