THIS IS A TEST INSTANCE. ALL YOUR CHANGES WILL BE LOST!!!!
Wiki Markup |
---|
To make it easier for people to understand the "Query Phrase Popularity" example in the tutorial, I added comment lines to script1-local.pig to show samples from the intermediate relations. Should this be checked in? /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ -- Query Phrase Popularity (local mode) -- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particularly high frequency during certain times of the day. -- Register the tutorial JAR file so that the included UDFs can be called in the script. REGISTER /tmp/tutorial.jar; -- Use the PigStorage function to load the excite log file into the raw bag as an array of records. -- Input: (user,time,query) raw = LOAD 'excite-small.log' USING PigStorage('\t') AS (user, time, query); --BED75271605EBD0C 970916001954 Yahoo chat --BED75271605EBD0C 970916003523 Yahoo chat --2B73EFE0F9FC9E0B 970916195507 http://educationalproducts.com --CD37F95FC0886E1D 970916084059 www:http:/www.tti.com -- Call the NonURLDetector UDF to remove records if the query field is empty or a URL. 
clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query); --BED75271605EBD0C 970916001954 Yahoo chat --BED75271605EBD0C 970916003523 Yahoo chat -- Call the ToLower UDF to change the query field to lowercase. clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query; --BED75271605EBD0C 970916001954 yahoo chat --BED75271605EBD0C 970916003523 yahoo chat -- Because the log file only contains queries for a single day, we are only interested in the hour. -- The excite query log timestamp format is YYMMDDHHMMSS. -- Call the ExtractHour UDF to extract the hour (HH) from the time field. houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query; --BED75271605EBD0C 00 yahoo chat --BED75271605EBD0C 00 yahoo chat --BED75271605EBD0C 00 yahoo chat --BED75271605EBD0C 00 yahoo chat --BED75271605EBD0C 00 yahoo chat --BED75271605EBD0C 00 yahoo chat -- Call the NGramGenerator UDF to compose the n-grams of the query. ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram; --BED75271605EBD0C 00 chat --BED75271605EBD0C 00 yahoo --BED75271605EBD0C 00 yahoo chat --BED75271605EBD0C 00 chat --BED75271605EBD0C 00 yahoo --BED75271605EBD0C 00 yahoo chat -- Use the DISTINCT command to get the unique n-grams for all records. ngramed2 = DISTINCT ngramed1; --BED75271605EBD0C 00 chat --BED75271605EBD0C 00 yahoo --BED75271605EBD0C 00 yahoo chat -- Use the GROUP command to group records by n-gram and hour. hour_frequency1 = GROUP ngramed2 BY (ngram, hour); --(chat,00) {(BED75271605EBD0C,00,chat)} --(yahoo chat,00) {(BED75271605EBD0C,00,yahoo chat)} --(yahoo,00) {(BED75271605EBD0C,00,yahoo)} -- Use the COUNT function to get the count (occurrences) of each n-gram. hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count; --yahoo 00 1 --chat 00 1 --yahoo chat 00 1 -- Use the GROUP command to group records by n-gram only. 
-- Each group now corresponds to a distinct n-gram and has the count for each hour. uniq_frequency1 = GROUP hour_frequency2 BY group::ngram; --yahoo {(yahoo,04,1),(yahoo,00,1),(yahoo,01,1),(yahoo,02,1),(yahoo,03,1),(yahoo,09,1),(yahoo,10,1),(yahoo,19,1),(yahoo,20,1)} --chat {(chat,00,1),(chat,01,1),(chat,02,1),(chat,03,1),(chat,04,2),(chat,05,1),(chat,06,1),(chat,07,1),(chat,08,1),(chat,09,1),(chat,13,1),(chat,17,3),(chat,19,2),(chat,20,1)} --yahoo chat {(yahoo chat,00,1),(yahoo chat,01,1),(yahoo chat,02,1),(yahoo chat,03,1),(yahoo chat,04,1),(yahoo chat,09,1),(yahoo chat,19,1),(yahoo chat,20,1)} -- For each group, identify the hour in which this n-gram is used with a particularly high frequency. -- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram. uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1)); --chat 19 1.2126781251816656 2 1.2857142857142854 --chat 04 1.2126781251816656 2 1.2857142857142854 --chat 17 2.9104275004359965 3 1.2857142857142854 --new 07 2.4494897427831788 2 1.1428571428571426 --the 08 1.5895540678349904 4 1.9375 --the 09 0.0481683050859088 2 1.9375 -- Use the FOREACH-GENERATE command to assign names to the fields. uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean; --10 the 0.0481683050859088 2 1.9375 --19 chat 1.2126781251816656 2 1.2857142857142854 --04 chat 1.2126781251816656 2 1.2857142857142854 --17 chat 2.9104275004359965 3 1.2857142857142854 --14 city 2.2360679774997902 2 1.1666666666666665 -- Use the FILTER command to remove all records with a score less than or equal to 2.0. filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0; --08 s 2.545584412271571 3 1.3636363636363635 --19 in 2.1572774865200244 3 1.4285714285714284 --11 in 2.1572774865200244 3 1.4285714285714284 --10 to 2.6457513110645903 2 1.125 --19 car 2.23606797749979 3 1.3333333333333333 ---... 
-- Use the ORDER command to sort the remaining records by hour and score. ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score; --07 new 2.4494897427831788 2 1.1428571428571426 --08 pictures 2.04939015319192 3 1.4999999999999998 --08 computer 2.4494897427831788 2 1.1428571428571426 --08 s 2.545584412271571 3 1.3636363636363635 --10 free 2.2657896674010605 4 1.923076923076923 -- Use the PigStorage function to store the results. -- Output: (hour, n-gram, score, count, average_counts_among_all_hours) STORE ordered_uniq_frequency INTO 'script1-local-results.out' USING PigStorage(); |