Most of these commands are derived with the help of ChatGPT or stackoverflow. However, it is easier to look for a command here rather than searching for the answers to these questions over and over again.
There might be a few Python examples here not directly related to Polars!
.
as missing value¶import polars as pl
# df = pl.read_csv("your_file.csv", null_values=".", has_header = True)
.
with missing value¶df = pl.DataFrame({
"col1": [".", "AB456", "GK789", "."],
"col2": [10, 20, 30, 40],
"col3": [1, 2, 3, 4]
})
df = df.with_columns(pl.col(pl.String).replace({".": None}))
print(df)
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40],
"col3": [1, 2, 3, 4]
})
print(df.columns)
# Get the last two columns
print(df.columns[-2:])
# Get the first two columns
print(df.columns[:2])
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40]
})
# Replace values in 'col2' based on the condition that 'col1' starts with 'GK'
df = df.with_columns(
pl.when(pl.col("col1").str.starts_with("GK"))
.then(999) # Replace with your desired value
.otherwise(pl.col("col2")) # Keep original value if condition is not met
.alias("col2")
)
print(df)
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 25, 30, 40]
})
# Replace values in 'col2' based on multiple conditions
df = df.with_columns(
pl.when((pl.col("col1").str.starts_with("GK")) & (pl.col("col2") > 20))
.then(999) # Replace with your desired value
.otherwise(pl.col("col2")) # Keep original value if conditions are not met
.alias("col2")
)
print(df)
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 25, 30, 40]
})
df = df.with_columns(
pl.when(pl.col("col2") < 11).then(20)
.when(pl.col("col2") > 33).then(pl.col("col2") * 2)
.otherwise(pl.col("col2"))
.alias("col2")
)
print(df)
col1_vector = df["col1"].to_list()
print(col1_vector)
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2": [4, 5, 6]
})
# Extract values from 'col1' that are greater than 1
filtered_values = df.filter(pl.col("col1") > 1)["col1"].to_list()
print(filtered_values)
df.shape
# Generate the sequence from 1001 to 1010 and concatenate "DK" with each value
result = [f"DK{1000 + i}" for i in range(1, 11)]
print(result)
result = ["DK" + str(i) for i in range(1001, 1011)]
print(result)
string_list = ["AB123", "CD456", "EF789", "GH012"]
# Remove the first two characters from each string
modified_list = [s[2:] for s in string_list]
print(modified_list)
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40]
})
# Copy 'col2' to 'col1' where 'col1' starts with 'GK'
df = df.with_columns(
pl.when(pl.col("col1").str.starts_with("GK"))
.then(pl.col("col2")) # Copy 'col2' value to 'col1'
.otherwise(pl.col("col1")) # Keep original 'col1' value if condition is not met
.alias("col1")
)
print(df)
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40]
})
new_values = ["XY987", "ZW654", "XY987", "ZW654"]
# Copy 'new_values' to 'col1' where 'col1' starts with 'GK'
df = df.with_columns(
pl.when(pl.col("col1").str.starts_with("GK"))
.then(pl.Series(new_values)) # Assign new values with the same shape
.otherwise(pl.col("col1"))
.alias("col1")
)
print(df)
df = df.with_columns(pl.Series("col1", new_values))
print(df)
df = pl.DataFrame({
"col1": ["GK123", "AB456", "GK789", "CD012"],
"col2": [10, 20, 30, 40]
})
df = df.with_columns(col1 = pl.lit(pl.Series(new_values)))
print(df)
rowSums
) of columns "a" and "b" into column "a"¶df = pl.DataFrame({
"a": [1, 2, 3],
"b": [4, 5, 6]
})
# Compute the row-wise sum of columns "a" and "b" and store it in column "a"
df = df.with_columns(
(pl.col("a") + pl.col("b")).alias("a")
)
print(df)
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2_right": [4, 5, 6],
"col3": [7, 8, 9],
"col4_right": [10, 11, 12]
})
# Drop columns that end with "_right"
df = df.select([col for col in df.columns if not col.endswith("_right")])
print(df)
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2_right": [4, 5, 6],
"col3": [7, 8, 9],
"col4_right": [10, 11, 12]
})
# Rename columns ending with "_right" to "d*" (d1, d2, ...)
new_names = {}
counter = 1
for col in df.columns:
if col.endswith("_right"):
new_names[col] = f"d{counter}"
counter += 1
# Apply the renaming
df = df.rename(new_names)
print(df)
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2_right": [4, 5, 6],
"col3": [7, 8, 9],
"col4_right": [10, 11, 12]
})
# Create a dictionary to hold the renaming mappings
new_names = {}
# Iterate through the columns and rename those ending with "_right"
for col in df.columns:
if col.endswith("_right"):
# Extract the part before "_right" and prepend "d"
base_name = col.split("_right")[0]
new_names[col] = f"d{base_name}"
# Apply the renaming
df = df.rename(new_names)
print(df)
df = pl.DataFrame({
"original1": ["123-456", "789-012", "345-678"],
"original2": [1, 2, 3],
"original3": [10, 20, 30]
})
# Get the first two column names
old_names = df.columns[:2]
# New names for the first two columns
new_names = ["col1", "col2"]
# Rename the first two columns
df = df.rename({old_names[0]: new_names[0], old_names[1]: new_names[1]})
print(df)
df = pl.DataFrame({
"old_name1": [1, 2, 3],
"old_name2": [4, 5, 6],
"old_name3": [7, 8, 9]
})
# List of new column names
new_column_names = ["new_name1", "new_name2", "new_name3"]
# Rename all columns
df = df.rename({old: new for old, new in zip(df.columns, new_column_names)})
print(df)
.str.replace
and .str.replace_all
¶df = pl.DataFrame({
"col1": ["123-456-34", "789-012-78", "345-678-02"],
"col2": [1, 2, 3]
})
# Replace "-" with "" in the "col1" column
df = df.with_columns(
pl.col("col1").str.replace("-", "").alias("col1")
)
print(df)
df = pl.DataFrame({
"col1": ["123-456", "789-012", "345-678"],
"col2": [1, 2, 3]
})
# Replace "-" with "" in the "col1" column
df = df.with_columns(
pl.col("col1").str.replace_all("-", "").alias("col1")
)
print(df)
# df = pl.read_csv("data.csv", columns=[0, 2])
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2": [None, 5, None],
"col3": [7, None, 9]
})
# Replace null with 0 in the last two columns
df = df.with_columns(
[pl.col(df.columns[-2:]).fill_null(0)]
)
print(df)
df = pl.DataFrame({
"col1": ["ab123", "ab456", "ab789"],
"col2": ["cd001", "cd002", "cd003"]
})
# Drop the first two characters from all elements in the DataFrame
df = df.with_columns(
[pl.col(c).str.slice(2) for c in df.columns]
)
print(df)
# Now, do the same, but turn the resulting elements into integers.
df = pl.DataFrame({
"col1": ["ab123", "ab456", "ab789"],
"col2": ["cd001", "cd002", "cd003"]
})
# Drop the first two characters and convert the remaining strings to integers
df = df.with_columns(
[pl.col(c).str.slice(2).cast(pl.Int64) for c in df.columns]
)
print(df)
df = pl.DataFrame({
"col1": [1, 2, 3],
"col2": [4, 5, 6],
"col3": [7, 8, 9]
})
# Drop the first column by its index (0)
df = df.drop(df.columns[0])
print(df)
df.select(['col3', 'col2'])
df.select([pl.col('col3'), pl.col('col2')])
df = pl.DataFrame({
"col1": ["b", "c", "a", "e", "d"],
"col2": [1, 2, 3, 4, 5]
})
# The desired order for "col1"
desired_order = ["a", "b", "c", "d", "e"]
# Reorder the DataFrame by matching "col1" with the desired_order list
df_reordered = df.with_columns(
pl.col("col1").map_elements(lambda x: desired_order.index(x), return_dtype=pl.Int64).alias("sort_key")
).sort("sort_key").drop("sort_key")
print(df_reordered)
# or
df_reordered = df.join(pl.DataFrame({"col1": desired_order}), on = "col1")
print(df_reordered)
df = pl.DataFrame({
"col1": ["abc123", "def456", "ghi789", "abcxyz"],
"col2": [1, 2, 3, 4]
})
# Filter rows where "col1" contains the pattern "abc"
filtered_df = df.filter(pl.col("col1").str.contains("abc"))
print(filtered_df)
s = pl.Series("a", [1, 2, 3])
s2 = pl.Series("b", [4.0, 5.0, 6.0])
s.dot(s2)
print(df.rows())
rowSums
of a DataFrame¶df = pl.DataFrame({
"col1": [1, 2, 3],
"col2": [4, 5, 6],
"col3": [7, 8, 9]
})
df = df.with_columns(df.select(pl.sum_horizontal("*").alias("row_sum")))
df.with_columns(df.select(pl.sum_horizontal(["col1","col2"]).alias("row_sum")))
print(df)
print(df.head())
print(df.head(2))
print(df.tail())
print(df.tail(2))
import numpy as np
a = pl.DataFrame({
"col1": [5,1,3,2],
"col2": [1,1,1,3],
"col3": [1,2,1,4]
})
b = [1, 2, 3]
print(a.to_numpy().dot(b))