THIS IS A TEST INSTANCE. ALL YOUR CHANGES WILL BE LOST!!!!
...
Code Block | ||
---|---|---|
| ||
# Cache the sample so that it doesn't have to be read from the file system in later jobs. sample = env.read_text_file(...).cache() # Print out some records as a sanity check; the sample is cached after the job is finished. print(sample.execute_and_collect(limit=10)) # Read the cached sample, run the expensive data processing and cache the result. preprocessed = sample.flat_map(...).cache() # Check that the preprocessing produces the correct result; the preprocessed result is cached after the job is finished. print(preprocessed.execute_and_collect(limit=10)) # Explore the distribution of positive and negative samples, using the cached preprocessed result. sample_cnt = preprocessed.count() positive_cnt = preprocessed.filter(...).count() # Explore the dataset with the cached preprocessed result. preprocessed.key_by(...) .reduce(...) .execute_and_collect() # Close the StreamExecutionEnvironment env.close() |
...