...
CREATE SCHEDULED QUERY <scheduled_query_name>
<scheduleSpecification>
[<executedAsSpec> ]
[<enableSpecification>]
<definedAsSpec>
Alter scheduled query syntax
ALTER SCHEDULED QUERY <scheduled_query_name>
(<scheduleSpecification>|<executedAsSpec>|<enableSpecification>|<definedAsSpec>);
Drop syntax
DROP SCHEDULED QUERY <scheduled_query_name>;
...
Anchor | ||||
---|---|---|---|---|
|
CRON <quartz_schedule_expression>
where
quartz_schedule_expression is quoted schedule in the Quartz format Anchor quartz_schedule_expression quartz_schedule_expression
...
- hive.scheduled.queries.executor.enabled (default: true)
Controls whether HS2 will run scheduled query executor. - hive.scheduled.queries.namespace (default: "hive")
Sets the scheduled query namespace to be used. New scheduled queries are created in this namespace, and execution is also bound to the namespace - hive.scheduled.queries.executor.idle.sleep.time (default: 1 minute)
Time to sleep between querying for the presence of a scheduled query. - hive.scheduled.queries.executor.progress.report.interval (default: 1 minute)
While scheduled queries are in flight, a background update happens periodically to report the actual state of the query. - hive.scheduled.queries.create.as.enabled (default: true)
This option sets the default behaviour of newly created scheduled queries. - hive.security.authorization.scheduled.queries.supported (default: false)
Enable this if the configured authorizer is able to handle scheduled query related calls.
Examples
Example 1 - basic example of using schedules
Code Block |
---|
create table t (a integer);
-- create a scheduled query; every 10 minutes insert a new row
create scheduled query sc1 cron '0 */10 * * * ? *' as insert into t values (1);
-- depending on hive.scheduled.queries.create.as.enabled the query might get created in disabled mode
-- it can be enabled using:
alter scheduled query sc1 enabled;
-- inspect scheduled queries using the information_schema
select * from information_schema.scheduled_queries s where schedule_name='sc1';
+-----------------------+------------------+------------+----------------------+-------------------+---------+-----------+----------------------+
| s.scheduled_query_id | s.schedule_name | s.enabled | s.cluster_namespace | s.schedule | s.user | s.query | s.next_execution |
+-----------------------+------------------+------------+----------------------+-------------------+---------+-----------+----------------------+
| 1 | sc1 | true | hive | 0 */10 * * * ? * | dev | select 1 | 2020-02-03 15:10:00 |
+-----------------------+------------------+------------+----------------------+-------------------+---------+-----------+----------------------+
-- wait 10 minutes
select * from information_schema.scheduled_executions s where schedule_name='sc1' order by scheduled_execution_id desc limit 1;
+---------------------------+------------------+----------------------------------------------------+-----------+----------------------+----------------------+------------+------------------+---------------------+
| s.scheduled_execution_id | s.schedule_name | s.executor_query_id | s.state | s.start_time | s.end_time | s.elapsed | s.error_message | s.last_update_time |
+---------------------------+------------------+----------------------------------------------------+-----------+----------------------+----------------------+------------+------------------+---------------------+
| 496 | sc1 | dev_20200203152025_bdf3deac-0ca6-407f-b122-c637e50f99c8 | FINISHED | 2020-02-03 15:20:23 | 2020-02-03 15:20:31 | 8 | NULL | NULL |
+---------------------------+------------------+----------------------------------------------------+-----------+----------------------+----------------------+------------+------------------+---------------------+
|
Example 2 - analyze external table periodically
Suppose you have an external table whose contents are slowly changing...this will eventually lead Hive to use outdated statistics during planning time
Code Block |
---|
-- create external table
create external table ex (a integer);
-- see where the table lives:
desc formatted ex;
[...]
| Location: | file:/data/hive/warehouse/ex | NULL |
[...]
-- in a terminal, load some data into the table directory:
seq 1 10 > /data/hive/warehouse/ex/f1
-- back in hive you will see that
select count(1) from ex;
10
-- meanwhile basic stats show that the table has "0" rows
desc formatted ex;
[...]
| | numRows | 0 |
[...]
create scheduled query ex_analyze cron '0 */1 * * * ? *' as analyze table ex compute statistics for columns;
-- wait some time...
select * from information_schema.scheduled_executions s where schedule_name='ex_analyze' order by scheduled_execution_id desc limit 3;
+---------------------------+------------------+----------------------------------------------------+------------+----------------------+----------------------+------------+------------------+----------------------+
| s.scheduled_execution_id | s.schedule_name | s.executor_query_id | s.state | s.start_time | s.end_time | s.elapsed | s.error_message | s.last_update_time |
+---------------------------+------------------+----------------------------------------------------+------------+----------------------+----------------------+------------+------------------+----------------------+
| 498 | ex_analyze | dev_20200203152640_a59bc198-3ed3-4ef2-8f63-573607c9914e | FINISHED | 2020-02-03 15:26:38 | 2020-02-03 15:28:01 | 83 | NULL | NULL |
+---------------------------+------------------+----------------------------------------------------+------------+----------------------+----------------------+------------+------------------+----------------------+
-- and the numrows have been updated
desc formatted ex;
[...]
| | numRows | 10 |
[...]
-- we don't want this running every minute anymore...
alter scheduled query ex_analyze disable; |
Example 3 - materialized view rebuild
Code Block |
---|
-- some settings...they might be there already
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.strict.checks.cartesian.product=false;
set hive.stats.fetch.column.stats=true;
set hive.materializedview.rewriting=true;
-- create some tables
CREATE TABLE emps (
empid INT,
deptno INT,
name VARCHAR(256),
salary FLOAT,
hire_date TIMESTAMP)
STORED AS ORC
TBLPROPERTIES ('transactional'='true');
CREATE TABLE depts (
deptno INT,
deptname VARCHAR(256),
locationid INT)
STORED AS ORC
TBLPROPERTIES ('transactional'='true');
-- load data
insert into emps values (100, 10, 'Bill', 10000, 1000), (200, 20, 'Eric', 8000, 500),
(150, 10, 'Sebastian', 7000, null), (110, 10, 'Theodore', 10000, 250), (120, 10, 'Bill', 10000, 250),
(1330, 10, 'Bill', 10000, '2020-01-02');
insert into depts values (10, 'Sales', 10), (30, 'Marketing', null), (20, 'HR', 20);
insert into emps values (1330, 10, 'Bill', 10000, '2020-01-02');
-- create mv
CREATE MATERIALIZED VIEW mv1 AS
SELECT empid, deptname, hire_date FROM emps
JOIN depts ON (emps.deptno = depts.deptno)
WHERE hire_date >= '2016-01-01 00:00:00';
EXPLAIN
SELECT empid, deptname FROM emps
JOIN depts ON (emps.deptno = depts.deptno)
WHERE hire_date >= '2018-01-01';
-- create a schedule to rebuild mv
create scheduled query d cron '0 */10 * * * ? *' defined as
alter materialized view mv1 rebuild;
-- from this explain it will be seen that the mv1 is being used
EXPLAIN
SELECT empid, deptname FROM emps
JOIN depts ON (emps.deptno = depts.deptno)
WHERE hire_date >= '2018-01-01';
-- insert a new record
insert into emps values (1330, 10, 'Bill', 10000, '2020-01-02');
-- the source tables are scanned
EXPLAIN
SELECT empid, deptname FROM emps
JOIN depts ON (emps.deptno = depts.deptno)
WHERE hire_date >= '2018-01-01';
-- wait 10 minutes
-- run it again...the view should be rebuilt
EXPLAIN
SELECT empid, deptname FROM emps
JOIN depts ON (emps.deptno = depts.deptno)
WHERE hire_date >= '2018-01-01';
|
create table t (a integer);
...