Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Info
Version information

Column statistics were introduced in Hive 0.10.0 by HIVE-1362. This is the design document.

Automatic gathering of column statistics was introduced in Hive 2.3 by HIVE-11160. This page also serves as the design document for that feature.

For general information about Hive statistics, see Statistics in Hive. For information about top K statistics, see Column Level Top K Statistics.

...

Please note that table and column aliases are not supported in the analyze statement.

To view column statistics:

describe formatted [table_name] [column_name];

Metastore Schema

To persist column-level statistics, we propose to add the following new tables:

...

LOW_VALUE RAW,
HIGH_VALUE RAW,
NUM_NULLS BIGINT,
NUM_DISTINCTS BIGINT,

BIT_VECTOR BLOB,  /* introduced in HIVE-16997 in Hive 3.0.0 */

AVG_COL_LEN DOUBLE,
MAX_COL_LEN BIGINT,
NUM_TRUES BIGINT,
NUM_FALSES BIGINT,
LAST_ANALYZED BIGINT NOT NULL)

...

LOW_VALUE RAW,
HIGH_VALUE RAW,
NUM_NULLS BIGINT,
NUM_DISTINCTS BIGINT,

BIT_VECTOR BLOB,  /* introduced in HIVE-16997 in Hive 3.0.0 */

AVG_COL_LEN DOUBLE,
MAX_COL_LEN BIGINT,
NUM_TRUES BIGINT,
NUM_FALSES BIGINT,
LAST_ANALYZED BIGINT NOT NULL)

...

/**
 * Statistics payload carried in ColumnStatisticsData.doubleStats.
 *
 * NOTE(review): lowValue/highValue are `required` here, whereas
 * DecimalColumnStatsData and DateColumnStatsData declare them `optional`.
 * Confirm the difference is intentional.
 */
struct DoubleColumnStatsData {
  1: required double lowValue,
  2: required double highValue,
  3: required i64 numNulls,
  // presumably the number of distinct values (cf. NUM_DISTINCTS) -- TODO confirm
  4: required i64 numDVs,
  // optional, so it may be absent on the wire
  5: optional string bitVectors
}

/**
 * Statistics payload carried in ColumnStatisticsData.longStats.
 *
 * NOTE(review): lowValue/highValue are `required` here, whereas
 * DecimalColumnStatsData and DateColumnStatsData declare them `optional`.
 * Confirm the difference is intentional.
 */
struct LongColumnStatsData {
  1: required i64 lowValue,
  2: required i64 highValue,
  3: required i64 numNulls,
  // presumably the number of distinct values (cf. NUM_DISTINCTS) -- TODO confirm
  4: required i64 numDVs,
  // optional, so it may be absent on the wire
  5: optional string bitVectors
}

/**
 * Statistics payload carried in ColumnStatisticsData.stringStats.
 * Holds length-based figures (maximum and average) instead of low/high values.
 */
struct StringColumnStatsData {
  1: required i64 maxColLen,
  2: required double avgColLen,
  3: required i64 numNulls,
  // presumably the number of distinct values (cf. NUM_DISTINCTS) -- TODO confirm
  4: required i64 numDVs,
  // optional, so it may be absent on the wire
  5: optional string bitVectors
}

/**
 * Statistics payload carried in ColumnStatisticsData.binaryStats.
 * Unlike StringColumnStatsData, this struct declares no numDVs or
 * bitVectors field.
 */
struct BinaryColumnStatsData {
  1: required i64 maxColLen,
  2: required double avgColLen,
  3: required i64 numNulls
}

/**
 * Decimal value expressed as an unscaled binary magnitude plus a scale.
 * Used for the low/high values of DecimalColumnStatsData.
 *
 * Field ids are 1 and 3 (id 2 is not used); ids are wire identifiers and
 * are kept exactly as declared.
 */
struct Decimal {
  1: required binary unscaled,
  3: required i16 scale
}

/**
 * Statistics payload carried in ColumnStatisticsData.decimalStats.
 * lowValue/highValue are optional and may therefore be unset.
 */
struct DecimalColumnStatsData {
  1: optional Decimal lowValue,
  2: optional Decimal highValue,
  3: required i64 numNulls,
  // presumably the number of distinct values (cf. NUM_DISTINCTS) -- TODO confirm
  4: required i64 numDVs,
  // optional, so it may be absent on the wire
  5: optional string bitVectors
}

/**
 * Date value stored as a single 64-bit day count (daysSinceEpoch).
 * Used for the low/high values of DateColumnStatsData.
 */
struct Date {
  1: required i64 daysSinceEpoch
}

/**
 * Statistics payload carried in ColumnStatisticsData.dateStats.
 * lowValue/highValue are optional and may therefore be unset.
 */
struct DateColumnStatsData {
  1: optional Date lowValue,
  2: optional Date highValue,
  3: required i64 numNulls,
  // presumably the number of distinct values (cf. NUM_DISTINCTS) -- TODO confirm
  4: required i64 numDVs,
  // optional, so it may be absent on the wire
  5: optional string bitVectors
}

/**
 * Union selecting exactly one type-specific statistics payload.
 * BooleanColumnStatsData is referenced here but is not defined in the
 * visible part of this document.
 */
union ColumnStatisticsData {
  1: BooleanColumnStatsData booleanStats,
  2: LongColumnStatsData    longStats,
  3: DoubleColumnStatsData  doubleStats,
  4: StringColumnStatsData  stringStats,
  5: BinaryColumnStatsData  binaryStats,
  6: DecimalColumnStatsData decimalStats,
  7: DateColumnStatsData    dateStats
}

/**
 * Statistics for a single column: its name, its type (as a string), and
 * the type-specific payload held in a ColumnStatisticsData union.
 */
struct ColumnStatisticsObj {
  1: required string colName,
  2: required string colType,
  3: required ColumnStatisticsData statsData
}

...