Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use statistics in Faker CTAS #24585

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
*/
package io.trino.plugin.faker;

import com.google.common.collect.ImmutableMap;
import io.trino.spi.connector.ColumnMetadata;
import io.trino.spi.type.Type;

import java.util.Map;
import java.util.Optional;

import static java.util.Objects.requireNonNull;
Expand Down Expand Up @@ -57,4 +59,16 @@ public ColumnInfo withComment(Optional<String> comment)
.setComment(comment)
.build());
}

/**
 * Returns a copy of this column info that uses the given handle
 * while keeping the current column metadata unchanged.
 *
 * @param handle the column handle to use in the returned copy
 * @return a new {@code ColumnInfo} pairing {@code handle} with the existing metadata
 */
public ColumnInfo withHandle(FakerColumnHandle handle)
{
    // Only the handle is swapped; metadata is carried over from this instance.
    return new ColumnInfo(handle, this.metadata);
}

/**
 * Returns a copy of this column info whose metadata carries the given properties.
 * The remaining metadata attributes are taken from the current metadata via
 * {@code ColumnMetadata.builderFrom}; the handle is kept as is.
 *
 * @param properties the column properties to set; copied into an immutable map
 * @return a new {@code ColumnInfo} with the same handle and the rebuilt metadata
 */
public ColumnInfo withProperties(Map<String, Object> properties)
{
    ColumnMetadata updatedMetadata = ColumnMetadata.builderFrom(metadata)
            .setProperties(ImmutableMap.copyOf(properties))
            .build();
    return new ColumnInfo(handle, updatedMetadata);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,19 @@ private static List<String> strings(Collection<?> values)
.map(String.class::cast)
.collect(toImmutableList());
}

/**
 * Returns a copy of this handle with the null probability replaced.
 * All other components are carried over from this instance.
 *
 * @param nullProbability the null probability to use in the returned copy
 * @return a new {@code FakerColumnHandle} differing only in its null probability
 */
public FakerColumnHandle withNullProbability(double nullProbability)
{
    // The unqualified nullProbability is the parameter (it shadows the field);
    // everything qualified with this. is copied from the current handle.
    return new FakerColumnHandle(
            this.columnIndex,
            this.name,
            this.type,
            nullProbability,
            this.generator,
            this.domain,
            this.step);
}

/**
 * Returns a copy of this handle with the domain replaced.
 * All other components are carried over from this instance.
 *
 * @param domain the domain to use in the returned copy
 * @return a new {@code FakerColumnHandle} differing only in its domain
 */
public FakerColumnHandle withDomain(Domain domain)
{
    // The unqualified domain is the parameter (it shadows the field);
    // everything qualified with this. is copied from the current handle.
    return new FakerColumnHandle(
            this.columnIndex,
            this.name,
            this.type,
            this.nullProbability,
            this.generator,
            domain,
            this.step);
}

/**
 * Returns a copy of this handle with the step replaced.
 * All other components are carried over from this instance.
 *
 * @param step the step value set to use in the returned copy
 * @return a new {@code FakerColumnHandle} differing only in its step
 */
public FakerColumnHandle withStep(ValueSet step)
{
    // The unqualified step is the parameter (it shadows the field);
    // everything qualified with this. is copied from the current handle.
    return new FakerColumnHandle(
            this.columnIndex,
            this.name,
            this.type,
            this.nullProbability,
            this.generator,
            this.domain,
            step);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ public class FakerConfig
private double nullProbability = 0.5;
private long defaultLimit = 1000L;
private Locale locale = Locale.ENGLISH;
private double sequenceMinDistinctValuesRatio = 0.98;
private long maxDictionarySize = 1000L;

@Max(1)
@Min(0)
Expand Down Expand Up @@ -68,4 +70,39 @@ public FakerConfig setLocale(String value)
this.locale = new Locale.Builder().setLanguageTag(value).build();
return this;
}

/**
 * Returns the configured minimum ratio of distinct values to total rows
 * above which a column is treated as a sequence.
 * The value is constrained by the {@code @Min(0)} and {@code @Max(2)} annotations;
 * the upper bound exceeds 1 because a value greater than 1 is how sequences are disabled.
 *
 * @return the current ratio setting
 */
@Min(0)
@Max(2)
public double getSequenceMinDistinctValuesRatio()
{
    return this.sequenceMinDistinctValuesRatio;
}

/**
 * Sets the value bound to the {@code faker.sequence-min-distinct-values-ratio}
 * configuration property.
 *
 * @param value the new ratio
 * @return this config instance, to allow call chaining
 */
@Config("faker.sequence-min-distinct-values-ratio")
@ConfigDescription(
        """
        Minimum ratio of distinct values of a column to total number of rows in a table to treat the columns as a sequence
        when creating a table using existing data. Set to a value greater than 1 to disable using sequences""")
public FakerConfig setSequenceMinDistinctValuesRatio(double value)
{
    sequenceMinDistinctValuesRatio = value;
    return this;
}

/**
 * Returns the configured maximum dictionary size.
 * The value is constrained to be non-negative by the {@code @Min(0)} annotation.
 *
 * @return the current maximum dictionary size setting
 */
@Min(0)
public long getMaxDictionarySize()
{
    return this.maxDictionarySize;
}

/**
 * Sets the value bound to the {@code faker.max-dictionary-size}
 * configuration property.
 *
 * @param value the new maximum dictionary size
 * @return this config instance, to allow call chaining
 */
@Config("faker.max-dictionary-size")
@ConfigDescription(
        """
        Maximum size of randomly generated dictionaries to pick values from, used for columns with low number of approximate distinct values
        observed during table creation using existing data. Set to zero to disable using dictionaries""")
public FakerConfig setMaxDictionarySize(long value)
{
    maxDictionarySize = value;
    return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,25 @@ public List<PropertyMetadata<?>> getSchemaProperties()
"Default limit of rows returned from any table in this schema, if not specified in the query",
null,
defaultLimit -> checkProperty(1 <= defaultLimit, INVALID_SCHEMA_PROPERTY, "default_limit value must be equal or greater than 1"),
false),
doubleProperty(
SchemaInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO,
"""
Minimum ratio of distinct values of a column to total number of rows in a table to treat the columns as a sequence
when creating a table in this schema using existing data. Set to a value greater than 1 to disable using sequences""",
nineinchnick marked this conversation as resolved.
Show resolved Hide resolved
null,
sequenceMinDistinctValuesRatio -> checkProperty(
0 <= sequenceMinDistinctValuesRatio && sequenceMinDistinctValuesRatio <= 2,
INVALID_SCHEMA_PROPERTY,
SchemaInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO + " value must be between 0 and 2, inclusive"),
false),
longProperty(
SchemaInfo.MAX_DICTIONARY_SIZE,
"""
Maximum size of randomly generated dictionaries to pick values from, used for columns with low number of approximate distinct values
observed during table created in this schema using existing data. Set to zero to disable using dictionaries""",
null,
maxDictionarySize -> checkProperty(0 <= maxDictionarySize, INVALID_SCHEMA_PROPERTY, "max_dictionary_size value must be equal or greater than 0"),
false));
}

Expand All @@ -143,6 +162,25 @@ public List<PropertyMetadata<?>> getTableProperties()
"Default limit of rows returned from this table if not specified in the query",
null,
defaultLimit -> checkProperty(1 <= defaultLimit, INVALID_TABLE_PROPERTY, "default_limit value must be equal or greater than 1"),
false),
doubleProperty(
TableInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO,
"""
Minimum ratio of distinct values of a column to total number of rows in a table to treat the columns as a sequence
when creating a table using existing data. Set to a value greater than 1 to disable using sequences""",
null,
sequenceMinDistinctValuesRatio -> checkProperty(
0 <= sequenceMinDistinctValuesRatio && sequenceMinDistinctValuesRatio <= 2,
INVALID_TABLE_PROPERTY,
TableInfo.SEQUENCE_MIN_DISTINCT_VALUES_RATIO + " value must be between 0 and 2, inclusive"),
false),
longProperty(
TableInfo.MAX_DICTIONARY_SIZE,
"""
Maximum size of randomly generated dictionaries to pick values from, used for columns with low number of approximate distinct values
observed during table creation using existing data. Set to zero to disable using dictionaries""",
null,
maxDictionarySize -> checkProperty(0 <= maxDictionarySize, INVALID_TABLE_PROPERTY, "max_dictionary_size value must be equal or greater than 0"),
false));
}

Expand Down
Loading
Loading