Caution: The data generation is an expensive operation. A scalefactor of 1 might take 10-20 minutes depending on the machine specifications, and will be using all the available machine resources.
The TPC-DS datasource function is used to generate data with accordance to the specifications of the TPC Benchmark DS. Below are the major properties of the TPC-DS generator:
- The TPC-DS generator is a private function, and needs to be imported to be used.
- The TPC-DS generator is based on the dsdgen TPC-DS tool for generating the data.
- The data is generated in parallel, utilizing all the available partitions on all the available nodes.
- The generated data types are with accordance to the data types in the TPC-DS Schema. Each data type is converted to its respective proper type during the data generation (i.e integers, doubles, strings, ... etc)
- DATE and TIME types are treated as String.
- Two versions of the generator function exist:
- Single parameter version: Takes a single parameter, namely, scalefactor, and generates the data for all the tables for the specified
scalefactor
. - Two parameters versions: Takes two parameters, namely, tablename and scalefactor (in this order), and generates the data for the specified table tablename only, for the specified scalefactor.
- Single parameter version: Takes a single parameter, namely, scalefactor, and generates the data for all the tables for the specified
Scale Factor to Data Size Ratio
The table below shows the relation between the scale factor and the generated data size. Each scale factor translates to approximately 1 GB of data generation.
Scale Factor | Generated Size |
---|---|
1 | 1GB |
2 | 2GB |
10 | 10GB |
1000 | 1TB |
100000 | 100TB |
TPC-DS Specification References
TPC specifications can be found in the following link: http://www.tpc.org/tpc_documents_current_versions/current_specifications.asp
TPC-DS specifications can be found in the following link: http://www.tpc.org/tpc_documents_current_versions/pdf/tpc-ds_v2.11.0.pdf
TPC-DS Generation Query
Below are the query statements that can be used to generate the TPC-DS data, the datasets generated match the table names used in the TPC-DS specifications. The scalefactor variable in the query can be changed to generate the desired data size.
// Create the tpcds dataverse drop dataverse tpcds if exists; create dataverse tpcds; use tpcds; // Create the types for each dataset create type call_center as open {id: uuid}; create type catalog_page as open {id: uuid}; create type catalog_returns as open {id: uuid}; create type catalog_sales as open {id: uuid}; create type customer as open {id: uuid}; create type customer_address as open {id: uuid}; create type customer_demographics as open {id: uuid}; create type date_dim as open {id: uuid}; create type household_demographics as open {id: uuid}; create type income_band as open {id: uuid}; create type inventory as open {id: uuid}; create type item as open {id: uuid}; create type promotion as open {id: uuid}; create type reason as open {id: uuid}; create type ship_mode as open {id: uuid}; create type store as open {id: uuid}; create type store_returns as open {id: uuid}; create type store_sales as open {id: uuid}; create type time_dim as open {id: uuid}; create type warehouse as open {id: uuid}; create type web_page as open {id: uuid}; create type web_returns as open {id: uuid}; create type web_sales as open {id: uuid}; create type web_site as open {id: uuid}; // Create the datasets create dataset call_center(call_center) primary key id autogenerated; create dataset catalog_page(catalog_page) primary key id autogenerated; create dataset catalog_returns(catalog_returns) primary key id autogenerated; create dataset catalog_sales(catalog_sales) primary key id autogenerated; create dataset customer(customer) primary key id autogenerated; create dataset customer_address(customer_address) primary key id autogenerated; create dataset customer_demographics(customer_demographics) primary key id autogenerated; create dataset date_dim(date_dim) primary key id autogenerated; create dataset household_demographics(household_demographics) primary key id autogenerated; create dataset income_band(income_band) primary key id autogenerated; create dataset inventory(inventory) primary key id autogenerated; create dataset item(item) primary key id autogenerated; create dataset promotion(promotion) primary key id autogenerated; create dataset reason(reason) primary key id autogenerated; create dataset ship_mode(ship_mode) primary key id autogenerated; create dataset store(store) primary key id autogenerated; create dataset store_returns(store_returns) primary key id autogenerated; create dataset store_sales(store_sales) primary key id autogenerated; create dataset time_dim(time_dim) primary key id autogenerated; create dataset warehouse(warehouse) primary key id autogenerated; create dataset web_page(web_page) primary key id autogenerated; create dataset web_returns(web_returns) primary key id autogenerated; create dataset web_sales(web_sales) primary key id autogenerated; create dataset web_site(web_site) primary key id autogenerated; // Generate the TPC-DS data for each dataset for the specified scale factor set `import-private-functions` `true`; insert into call_center(select value data from tpcds_datagen("call_center", 1) as data); insert into catalog_page(select value data from tpcds_datagen("catalog_page", 1) as data); insert into catalog_returns(select value data from tpcds_datagen("catalog_returns", 1) as data); insert into catalog_sales(select value data from tpcds_datagen("catalog_sales", 1) as data); insert into customer(select value data from tpcds_datagen("customer", 1) as data); insert into customer_address(select value data from tpcds_datagen("customer_address", 1) as data); insert into customer_demographics(select value data from tpcds_datagen("customer_demographics", 1) as data); insert into date_dim(select value data from tpcds_datagen("date_dim", 1) as data); insert into household_demographics(select value data from tpcds_datagen("household_demographics", 1) as data); insert into income_band(select value data from tpcds_datagen("income_band", 1) as data); insert into inventory(select value data from tpcds_datagen("inventory", 1) as data); insert into item(select value data from tpcds_datagen("item", 1) as data); insert into promotion(select value data from tpcds_datagen("promotion", 1) as data); insert into reason(select value data from tpcds_datagen("reason", 1) as data); insert into ship_mode(select value data from tpcds_datagen("ship_mode", 1) as data); insert into store(select value data from tpcds_datagen("store", 1) as data); insert into store_returns(select value data from tpcds_datagen("store_returns", 1) as data); insert into store_sales(select value data from tpcds_datagen("store_sales", 1) as data); insert into time_dim(select value data from tpcds_datagen("time_dim", 1) as data); insert into warehouse(select value data from tpcds_datagen("warehouse", 1) as data); insert into web_page(select value data from tpcds_datagen("web_page", 1) as data); insert into web_returns(select value data from tpcds_datagen("web_returns", 1) as data); insert into web_sales(select value data from tpcds_datagen("web_sales", 1) as data); insert into web_site(select value data from tpcds_datagen("web_site", 1) as data);