-
Notifications
You must be signed in to change notification settings - Fork 1
/
56-StepFunctions EMR and Glue.txt
73 lines (63 loc) · 2.22 KB
/
56-StepFunctions EMR and Glue.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# In order for this lab to work correctly, you should run the EMR Applications lab first.
# The EMR Applications lab adds the NYC Taxi metadata to your Glue Data Catalog so you'll see the data when
# you use the Hue UI to run the queries.
# Also, make sure to run both the EMR Applications lab and this lab in the same AWS region.
# Link to Configure proxy settings to view websites hosted on the Master Node:
https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-connect-master-node-proxy.html
# Link to Set up an SSH Tunnel to the Master Node using dynamic port forwarding
https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-ssh-tunnel.html
# Generate EMR default roles
aws emr create-default-roles
# SSH Tunnel
ssh -i ~/EMRSpark_keypair.pem -N -D 8157 [email protected]
# Input to state machine
{
"CreateCluster": true,
"TerminateCluster": false,
"EC2Deatils": {
"KeyPair": "EMRSpark_keypair"
},
"JobDetails": {
"GlueJobName": "taxi_parquet"
}
}
# Hue Queries of the Taxi data
# Get average tip per day of the week
select day_of_week (pickup_date) as day_of_week, avg(tip_amount) as average_tip from taxi_blog.parquet_taxi_data group by 1 order by 1
# Show schema
select * from taxi_blog.parquet_taxi_data limit 10
# Compute the number of trips that gave tips in the 10 percent, 15 percent, or higher percentage range
SELECT TipPrctCtgry
, COUNT (DISTINCT TripID) TripCt
FROM
(SELECT TripID
, (CASE
WHEN fare_prct < 0.7 THEN 'FL70'
WHEN fare_prct < 0.8 THEN 'FL80'
WHEN fare_prct < 0.9 THEN 'FL90'
ELSE 'FL100'
END) FarePrctCtgry
, (CASE
WHEN tip_prct < 0.1 THEN 'TL10'
WHEN tip_prct < 0.15 THEN 'TL15'
WHEN tip_prct < 0.2 THEN 'TL20'
ELSE 'TG20'
END) TipPrctCtgry
FROM
(SELECT TripID
, (fare_amount / total_amount) as fare_prct
, (extra / total_amount) as extra_prct
, (mta_tax / total_amount) as tip_prct
, (tolls_amount / total_amount) as mta_taxprct
, (tip_amount / total_amount) as tolls_prct
, (improvement_surcharge / total_amount) as imprv_suchrgprct
, total_amount
FROM
(SELECT *
, (cast(pickup_longitude AS VARCHAR(100)) || '_' || cast(pickup_latitude AS VARCHAR(100))) as TripID
from taxi_blog.parquet_taxi_data
WHERE total_amount > 0
) as t
) as t
) ct
GROUP BY TipPrctCtgry