First, set up the AWS CLI in its own virtualenv, with a development profile that is exported automatically whenever the environment is activated:
$ mkvirtualenv pycon-emr-dev
(pycon-emr-dev)$ pip install awscli
(pycon-emr-dev)$ mkdir ~/.awscli
(pycon-emr-dev)$ cat <<-EOF >> ~/.awscli/config
[profile development]
aws_access_key_id=<development_access_key>
aws_secret_access_key=<development_secret_key>
region=ap-northeast-1
EOF
(pycon-emr-dev)$ cat <<-EOF >> $VIRTUAL_ENV/bin/activate
export AWS_CONFIG_FILE=~/.awscli/config
export AWS_DEFAULT_PROFILE=development
source aws_zsh_completer.sh
EOF
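Since the profile is exported from the activate hook, re-entering the environment picks it up; a quick sanity check (assuming the development keys are valid, and using workon from virtualenvwrapper, the same tool that provides mkvirtualenv) is a harmless read-only call, which should return a cluster list rather than an authentication error:

$ workon pycon-emr-dev
(pycon-emr-dev)$ aws emr list-clusters

With the CLI working, a Hive cluster can be launched in one call, with the instance groups and the applications described in separate JSON files: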
$ aws emr create-cluster --ami-version 3.1.1 \
    --name 'PyConJP 2014 (AMI 3.1.1 Hive)' \
    --tags Name=pycon-jp-emr environment=development \
    --ec2-attributes KeyName=yourkey \
    --log-uri 's3://yourbucket/jobflow_logs/' \
    --no-auto-terminate \
    --visible-to-all-users \
    --instance-groups file://./normal-instance-setup.json \
    --applications file://./app-hive.json
[ { "Name": "emr-‐master", "InstanceGroupType": "MASTER", "InstanceCount": 1, "InstanceType": "m1.medium" }, { "Name": "emr-‐core", "InstanceGroupType": "CORE", "InstanceCount": 2, "InstanceType": "m1.medium" } ]
[ { "Name": "HIVE" } ]
{ "ClusterId": "j-‐8xxxxxxxxx" }
$ aws emr add-steps --cluster-id j-8xxxxxxxxx \
    --steps file://./hive-sample-step-1.json
[ { "Args": [ "-‐f", "s3n://yourbucket/hive-‐script/sample01.hql", "-‐d", "BUCKET_NAME=yourbucket", "-‐d", "TARGET_DATE=20140818" ], "ActionOnFailure": "CONTINUE", "Name": "Hive Sample Program 01", "Type": "HIVE" }, { "Args": [ "-‐f", "s3n://yourbucket/hive-‐script/sample02.hql", "-‐d", "BUCKET_NAME=yourbucket", "-‐d", "TARGET_DATE=20140818" ], "ActionOnFailure": "CONTINUE", "Name": "Hive Sample Program 02", "Type": "HIVE" } ]
$ aws emr add-steps --cluster-id j-8xxxxxxxxx \
    --steps file://./s3distcp-sample-step.json
[ { "Name": "s3distcp Sample", "ActionOnFailure": "CONTINUE", "Jar": "/home/hadoop/lib/emr-‐s3distcp-‐1.0.jar", "Type": "CUSTOM_JAR", "Args": [ "-‐-‐src", "s3n://yourbucket/access_log/dt=20140818", "-‐-‐dest", "s3n://yourbucket/compressed_log/dt=20140818", "-‐-‐groupBy", ".*(nginx_access_log-‐).*", "-‐-‐targetSize", "100", "-‐-‐outputCodec", "gzip" ] } ]
$ aws emr create-cluster --ami-version 3.1.1 \
    --name 'PyConJP 2014 (AMI 3.1.1 Hive)' \
    --tags Name=pycon-jp-emr environment=development \
    --ec2-attributes KeyName=yourkey \
    --log-uri 's3://yourbucket/jobflow_logs/' \
    --no-auto-terminate \
    --visible-to-all-users \
    --instance-groups file://./normal-instance-setup.json \
    --applications file://./app-hive-with-config.json
[ { "Args": [ "-‐-‐hive-‐site=s3://yourbucket/libs/config/hive-‐site.xml" ], "Name": "HIVE" } ]
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>hive.optimize.s3.query</name>
    <value>true</value>
    <description>Optimize query on S3</description>
  </property>
</configuration>
$ aws emr create-cluster --ami-version 3.1.1 \
    --name 'PyConJP 2014 (AMI 3.1.1 Hive + Presto)' \
    --tags Name=pycon-jp-emr environment=development \
    --ec2-attributes KeyName=yourkey \
    --log-uri 's3://yourbucket/jobflow_logs/' \
    --no-auto-terminate \
    --visible-to-all-users \
    --instance-groups file://./normal-instance-setup.json \
    --bootstrap-actions file://./bootstrap-presto.json \
    --applications file://./app-hive-with-config.json
[ { "Name": "Install/Setup Presto", "Path": "s3://yourbucket/libs/setup-‐presto.rb", "Args": [ "-‐-‐task_memory", "1GB", "-‐-‐log-‐level", "DEGUB", "-‐-‐version", "0.75", "-‐-‐presto-‐repo-‐url", "http://central.maven.org/maven2/com/facebook/presto/", "-‐-‐sink-‐buffer-‐size", "1GB", "-‐-‐query-‐max-‐age", "1h", "-‐-‐jvm-‐config", "-‐server -‐Xmx2G -‐XX:+UseConcMarkSweepGC -‐XX:+ExplicitGCInvokesConcurrent -‐XX:+CMSClassUnloadingEnabled -‐XX:+AggressiveOpts -‐XX:+HeapDumpOnOutOfMemoryError -‐XX:OnOutOfMemoryError=kill -‐9 %p -‐XX:PermSize=150M -‐XX:MaxPermSize=150M -‐XX:ReservedCodeCacheSize=150M -‐Dhive.config.resources=/home/hadoop/conf/core-‐site.xml,/home/hadoop/conf/hdfs-‐site.xml" ] } ]
To keep table definitions across clusters, the Hive metastore can be moved out of the cluster into an external MySQL database; the connection is configured in the same hive-site.xml:
<configuration>
  <property>
    <name>hive.optimize.s3.query</name>
    <value>true</value>
    <description>Optimize query on S3</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://hostname:3306/hive?createDatabaseIfNotExist=true</value>
    <description>JDBC connect string for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>username</value>
    <description>Username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>password</value>
    <description>Password to use against metastore database</description>
  </property>
</configuration>
The same clusters can also be driven from Python. With boto, run_jobflow launches a cluster with Hive installed, mirroring the CLI example above:
# -*- coding: utf-8 -*-
from datetime import datetime

from boto.emr import connect_to_region
from boto.emr.step import InstallHiveStep


def setup_emr():
    # need to export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
    # as environment variables.
    conn = connect_to_region('ap-northeast-1')
    install_step = InstallHiveStep(hive_versions='0.11.0.2')
    jobid = conn.run_jobflow(
        name='Create EMR [{}]'.format(datetime.today().strftime('%Y%m%d')),
        log_uri='s3://yourbucket/jobflow_logs/',
        ec2_keyname='your_key',
        master_instance_type='m1.medium',
        slave_instance_type='m1.medium',
        num_instances=3,
        action_on_failure='TERMINATE_JOB_FLOW',
        keep_alive=True,
        enable_debugging=False,
        hadoop_version='2.4.0',
        steps=[install_step],
        bootstrap_actions=[],
        instance_groups=None,
        additional_info=None,
        ami_version='3.1.1',
        api_params=None,
        visible_to_all_users=True,
        job_flow_role=None)
    return jobid


if __name__ == '__main__':
    jobflow_id = setup_emr()
    print "JobFlowID: {} started.".format(jobflow_id)
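run_jobflow only submits the request; the flow still has to bootstrap before it can take steps. A minimal sketch of waiting for it with boto 2 (describe_jobflow exposes the flow's state attribute; the 30-second interval and the list of settled states are assumptions, not part of the original slides):

# -*- coding: utf-8 -*-
import time

from boto.emr import connect_to_region


def wait_for_jobflow(jobflow_id, interval=30):
    # Poll the job flow until EMR reports a settled state.
    conn = connect_to_region('ap-northeast-1')
    while True:
        state = conn.describe_jobflow(jobflow_id).state
        if state in ('WAITING', 'RUNNING', 'COMPLETED',
                     'FAILED', 'TERMINATED'):
            return state
        time.sleep(interval)


if __name__ == '__main__':
    # e.g. the ID returned by setup_emr() above
    print wait_for_jobflow('j-8xxxxxxxxx')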
Hive query steps are then attached to the running job flow in the same way: HiveStep points at a script on S3 and carries the -d variables, and add_jobflow_steps submits them:
# conn, install_step, bucket_name, target_date and hive_version are assumed
# to be set up as in the previous listing.
jobid = conn.run_jobflow(
    name='Create EMR and Exec hiveql [{}]'.format(target_date),
    log_uri='s3://{}/jobflow_logs/'.format(bucket_name),
    ec2_keyname='your_key',
    master_instance_type='m1.medium',
    slave_instance_type='m1.medium',
    num_instances=3,
    action_on_failure='TERMINATE_JOB_FLOW',
    keep_alive=True,
    enable_debugging=False,
    hadoop_version='2.4.0',
    steps=[install_step],
    bootstrap_actions=[],
    instance_groups=None,
    additional_info=None,
    ami_version='3.1.1',
    api_params=None,
    visible_to_all_users=True,
    job_flow_role=None)

query_files = ['sample01.hql', 'sample02.hql']
hql_steps = []
for query_file in query_files:
    hql_step = HiveStep(
        name='Executing Query [{}]'.format(query_file),
        hive_file='s3n://{0}/hive-script/{1}'.format(
            bucket_name, query_file),
        hive_versions=hive_version,
        hive_args=['-dTARGET_DATE={0}'.format(target_date),
                   '-dBUCKET_NAME={0}'.format(bucket_name)])
    hql_steps.append(hql_step)

conn.add_jobflow_steps(jobid, hql_steps)
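Because the flow is started with keep_alive=True, the cluster keeps running (and billing) after the last step, so it has to be shut down explicitly once everything has finished. Continuing with the conn and jobid from the listing above, boto 2 exposes that as terminate_jobflow:

# Shut the cluster down once all steps are done; keep_alive=True means
# EMR will not terminate it on its own.
conn.terminate_jobflow(jobid)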