diff --git a/aws/emr/bootstrap/MyAWSCredentialsProviderWithUri.jar b/aws/emr/bootstrap/MyAWSCredentialsProviderWithUri.jar new file mode 100644 index 0000000..84ad09d Binary files /dev/null and b/aws/emr/bootstrap/MyAWSCredentialsProviderWithUri.jar differ diff --git a/aws/emr/bootstrap/ba_test.sh b/aws/emr/bootstrap/ba_test.sh new file mode 100644 index 0000000..049dca2 --- /dev/null +++ b/aws/emr/bootstrap/ba_test.sh @@ -0,0 +1,154 @@ +#!/bin/bash + +set -x + +install_jupyter=false + +build_vim() { + cd /tmp + git clone http://luajit.org/git/luajit-2.0.git + cd luajit-2.0 + make + sudo make install + + cd /tmp + git clone https://github.com/vim/vim.git + cd vim + ./configure \ + --with-features=huge \ + --enable-cscope \ + --enable-pythoninterp \ + --enable-luainterp \ + --enable-multibyte \ + --enable-fontset \ + --disable-gui \ + --without-x \ + --disable-netbeans \ + --enable-largefile + make + sudo make install + + if [ -e /usr/bin/vi ]; then + sudo rm /usr/bin/vi + fi + sudo ln -s /usr/local/bin/vim /usr/bin/vi + rm -rf /tmp/vim +} + +provision_packages() { + sudo yum groupinstall -y "Development Tools" + sudo yum install -y \ + tmux \ + wget \ + htop \ + mlocate \ + git \ + rake \ + zsh \ + jq \ + at \ + bind-utils \ + strace \ + lua \ + lua-devel \ + ncurses \ + ncurses-devel \ + gmp \ + gmp-devel \ + ctags \ + tcl-devel \ + perl \ + perl-devel \ + perl-ExtUtils-ParseXS \ + perl-ExtUtils-CBuilder \ + perl-ExtUtils-Embed + wget https://bootstrap.pypa.io/get-pip.py + sudo python2.7 ./get-pip.py + sudo env "PATH=$PATH" pip install awscli + cd ~ + wget https://raw.githubusercontent.com/o0beaner/dotfiles/master/install.sh + chmod +x install.sh + ./install.sh + sudo chmod 644 /usr/bin/chsh + sudo chmod +x /usr/bin/chsh + sudo /usr/bin/chsh -s /bin/zsh $USER + sudo updatedb + cd $util_path + wget --no-check-certificate $s3_utils/suntracker.sh + chmod +x $util_path/suntracker.sh + (crontab -l ; echo "0 3 * * * $util_path/suntracker.sh") | crontab - + $util_path/suntracker.sh + touch ~/.zsh.prompts + mkdir ~/.zsh.after/ + echo "prompt agnoster" > ~/.zsh.after/prompt.zsh +} + +install_ssm() { + cd /tmp + sudo yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm +} + +s3ify_zeppelin() { + cd $util_path + wget --no-check-certificate $s3_utils/configure_zeppelin_s3.sh + chmod +x $util_path/configure_zeppelin_s3.sh + aws emr add-steps --cluster-id $cluster_id --steps Type=CUSTOM_JAR,Name="Configure Zeppelin for S3",Jar="command-runner.jar",Args=[$util_path/configure_zeppelin_s3.sh] +} + +install_jupyter() { + cd $util_path + wget --no-check-certificate https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/install-jupyter-emr5.sh + chmod +x $util_path/install-jupyter-emr5.sh + $util_path/install-jupyter-emr5.sh \ + --r \ + --julia \ + --toree \ + --torch \ + --ruby \ + --ds-packages \ + --ml-packages \ + --python-packages ggplot nilearn \ + --port 8002 \ + --password jupyter \ + --jupyterhub \ + --jupyterhub-port 8001 \ + --cached-install \ + --notebook-dir s3://ty-emr/XRR/jupyter/notebooks/ \ + --copy-samples \ + --s3fs +} + +# get input parameters +while [ $# -gt 0 ]; do + case "$1" in + --jupyter) + install_jupyter=true + ;; + -*) + error_msg "unrecognized option: $1" + ;; + *) + break; + ;; + esac + shift +done + +s3_utils='https://s3.amazonaws.com/ty-emr/XRR/utils' +build_vim=false +util_path="~/.utils" + +mkdir -p $util_path + +provision_packages & +build_vim & +install_ssm & + +is_master=false +if grep isMaster 
/mnt/var/lib/info/instance.json | grep true; +then + s3ify_zeppelin & + if [ "$install_jupyter" == true ]; then + install_jupyter & + fi +fi diff --git a/aws/emr/bootstrap/boot_strappy_486.sh b/aws/emr/bootstrap/boot_strappy_486.sh new file mode 100644 index 0000000..492fff8 --- /dev/null +++ b/aws/emr/bootstrap/boot_strappy_486.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +logfile=bootstrap.txt +exec > $logfile 2>&1 + +set -x + +install_jupyter=false + +# get input parameters +while [ $# -gt 0 ]; do + case "$1" in + --jupyter) + install_jupyter=true + ;; + -*) + error_msg "unrecognized option: $1" + ;; + *) + break; + ;; + esac + shift +done + +platform=`uname` +s3_utils='https://s3.amazonaws.com/ty-emr/XRR/utils' +build_vim=false + +if [ "$USER" == "root" ]; then + util_path="/root/.utils" + home="/root" +else + if [ "$platform" == "Darwin" ]; then + users_dir="Users" + fi + if [ "$platform" == "Linux" ]; then + users_dir="home" + fi + util_path="/$users_dir/$USER/.utils" +fi +vim_check=`vim --version` +if [[ $vim_check != *"+lua"* ]]; then + build_vim=true +fi +mkdir -p $util_path + +release=`cat /etc/*release* | tr '[:upper:]' '[:lower:]'` +if [[ $release != *"smartos"* ]]; then + if [[ $release == *"rhel fedora"* ]]; then + echo "Looks like we're running on something that is kinda like RHEL..." + sudo yum groupinstall -y "Development Tools" + sudo yum install -y \ + tmux \ + wget \ + htop \ + mlocate \ + git \ + rake \ + zsh \ + jq \ + at \ + bind-utils \ + strace \ + lua \ + lua-devel \ + ncurses \ + ncurses-devel \ + gmp \ + gmp-devel \ + ctags \ + tcl-devel \ + perl \ + perl-devel \ + perl-ExtUtils-ParseXS \ + perl-ExtUtils-CBuilder \ + perl-ExtUtils-Embed + + if [[ $build_vim == true ]]; then + cd /tmp + git clone http://luajit.org/git/luajit-2.0.git + cd luajit-2.0 + make + sudo make install + + cd /tmp + git clone https://github.com/vim/vim.git + cd vim + ./configure \ + --with-features=huge \ + --enable-cscope \ + --enable-pythoninterp \ + --enable-luainterp \ + --enable-multibyte \ + --enable-fontset \ + --disable-gui \ + --without-x \ + --disable-netbeans \ + --enable-largefile + make + sudo make install + + if [ -e /usr/bin/vi ]; then + sudo rm /usr/bin/vi + fi + sudo ln -s /usr/local/bin/vim /usr/bin/vi + rm -rf /tmp/vim + fi + fi + if [[ $release == *"debian"* ]]; then + echo "Looks like we're running on a Debian based system!" 
+ sudo apt-get update + sudo apt-get install -y \ + tmux \ + htop \ + wget \ + mlocate \ + git \ + rake \ + zsh \ + jq \ + at \ + dnsutils \ + strace \ + libncurses5-dev \ + libncursesw5-dev \ + python-dev \ + ruby-dev \ + lua5.1 \ + lua5.1-dev \ + luajit \ + libluajit-5.1 \ + libperl-dev \ + build-essential + + if [[ $build_vim == true ]]; then + sudo ln -sf /usr/include/lua5.1 /usr/include/lua5.1/include + sudo ln -sf /usr/lib/x86_64-linux-gnu/liblua5.1.so /usr/local/lib/liblua.so + cd /tmp + git clone https://github.com/vim/vim.git + cd vim + ./configure \ + --with-features=huge \ + --enable-cscope \ + --enable-pythoninterp=yes \ + --enable-rubyinterp=yes \ + --with-python-config-dir=/usr/lib/python2.7/config-x86_64-linux-gnu \ + --enable-multibyte \ + --enable-fontset \ + --disable-gui \ + --disable-netbeans \ + --enable-luainterp=yes \ + --with-luajit \ + --with-lua-prefix=/usr/include/lua5.1 \ + --enable-largefile + + make + sudo make install + + if [ -e /usr/bin/vi ]; then + sudo rm /usr/bin/vi + fi + sudo ln -s /usr/local/bin/vim /usr/bin/vi + rm -rf /tmp/vim + fi + fi + if [[ $release == *"Arch Linux"* ]]; then + echo "Looks like we're running on Arch!" + yaourt -S --noconfirm \ + gnu-netcat \ + cron \ + tmux \ + htop \ + wget \ + mlocate \ + git \ + rake \ + zsh \ + jq \ + at \ + vim\ + bind-tools \ + strace \ + ncurses \ + ctags + fi + + wget https://bootstrap.pypa.io/get-pip.py + sudo python2.7 ./get-pip.py + sudo env "PATH=$PATH" pip install awscli + su -c "`curl -fksSL https://raw.githubusercontent.com/o0beaner/dotfiles/master/install.sh`" $USER + sudo chmod 644 /usr/bin/chsh + sudo chmod +x /usr/bin/chsh + sudo /usr/bin/chsh -s /bin/zsh $USER + sudo updatedb + cd $util_path + wget --no-check-certificate $s3_utils/suntracker.sh + chmod +x $util_path/suntracker.sh + (crontab -l ; echo "0 3 * * * $util_path/suntracker.sh") | crontab - + $util_path/suntracker.sh +else + BOOTSTRAP_TAR="bootstrap-2017Q1-x86_64.tar.gz" + curl -Ok https://pkgsrc.joyent.com/packages/SmartOS/bootstrap/${BOOTSTRAP_TAR} + tar -zxpf ${BOOTSTRAP_TAR} -C / + rm -f boots* + PATH=/opt/local/sbin:/opt/local/bin:$PATH + MANPATH=/opt/local/man:$MANPATH + pkgin -y in jq tmux git ruby22-rake zsh at || true + mkdir /usbkey/root + mv /root/.[!.]* /usbkey/root + cd / + rm -rf /root + ln -s /usbkey/root /root + su -c "`curl -fksSL https://raw.githubusercontent.com/o0beaner/dotfiles/master/install.sh`" $USER + echo 'if [ -n "$BASH_EXECUTION_STRING" ]; then' >> ~/.bashrc + echo ' export SHELL=/opt/local/bin/zsh' >> ~/.bashrc + echo ' exec "$SHELL" -c "$BASH_EXECUTION_STRING"' >> ~/.bashrc + echo 'fi' >> ~/.bashrc + echo 'SHELL=/opt/local/bin/zsh; exec "$SHELL"' >> ~/.bashrc +fi + +# AWS Instance customization +if [ -e /usr/bin/cloud-init ]; then + + # Install SSM Agent + cd /tmp + sudo yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm + + # Am I running EMR? + instance=`aws ec2 describe-instances --instance-ids $(curl -s 169.254.169.254/latest/meta-data/instance-id)` + tags=`echo $instance | jq -r '.Reservations[0].Instances[0].Tags[]'` + cluster_id=`echo $tags | jq -r '. | select(.Key=="aws:elasticmapreduce:job-flow-id") | .Value'` + if [ -n "$cluster_id" ]; then + echo "$cluster_id" > ~/.cluster_id + role=`echo $tags | jq -r '. | select(.Key=="aws:elasticmapreduce:instance-group-role") | .Value'` + + if [ "$role" == "MASTER" ]; then + # ToDo: Incorporate Hue? 
+ + cd $util_path + wget --no-check-certificate $s3_utils/configure_zeppelin_s3.sh + chmod +x $util_path/configure_zeppelin_s3.sh + aws emr add-steps --cluster-id $cluster_id --steps Type=CUSTOM_JAR,Name="Configure Zeppelin for S3",Jar="command-runner.jar",Args=[$util_path/configure_zeppelin_s3.sh] + fi + + # install jupyter + if [ $install_jupyter == true ]; then + cd $util_path + wget --no-check-certificate https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/install-jupyter-emr5.sh + chmod +x $util_path/install-jupyter-emr5.sh + $util_path/install-jupyter-emr5.sh \ + --r \ + --julia \ + --toree \ + --torch \ + --ruby \ + --ds-packages \ + --ml-packages \ + --python-packages ggplot nilearn \ + --port 8002 \ + --password jupyter \ + --jupyterhub \ + --jupyterhub-port 8001 \ + --cached-install \ + --notebook-dir s3://ty-emr/XRR/jupyter/notebooks/ \ + --copy-samples \ + --s3fs + fi + fi +fi + +touch ~/.zsh.prompts +mkdir ~/.zsh.after/ +echo "prompt agnoster" > ~/.zsh.after/prompt.zsh diff --git a/aws/emr/bootstrap/boot_strappy_short.sh b/aws/emr/bootstrap/boot_strappy_short.sh new file mode 100644 index 0000000..7df765d --- /dev/null +++ b/aws/emr/bootstrap/boot_strappy_short.sh @@ -0,0 +1,22 @@ +#!/bin/bash + + +set -x +logfile=test.txt + +exec > $logfile 2>&1 +sudo apt-get update +sudo apt-get install -y \ + tmux \ + htop \ + wget \ + git \ + rake \ + zsh \ + +rm -rf .yadr/ + +HOME=/root +wget https://raw.githubusercontent.com/o0beaner/dotfiles/master/install.sh +chmod +x ./install.sh +su -c "`curl -fksSL https://raw.githubusercontent.com/o0beaner/dotfiles/master/install.sh`" $USER diff --git a/aws/emr/bootstrap/certs.zip b/aws/emr/bootstrap/certs.zip new file mode 100644 index 0000000..79ccb69 Binary files /dev/null and b/aws/emr/bootstrap/certs.zip differ diff --git a/aws/emr/bootstrap/configure-hbase-daemons b/aws/emr/bootstrap/configure-hbase-daemons new file mode 100644 index 0000000..695831c --- /dev/null +++ b/aws/emr/bootstrap/configure-hbase-daemons @@ -0,0 +1,70 @@ +#!/bin/bash + +set -e + +# first validate the arguments +REPLACE_FILE=false +for i in "$@" ; do + case $i in + --*-opts*) + if ! echo $i | grep -E -- '--[a-zA-Z-]+?-opts=.+' > /dev/null 2>&1 ; then + echo "Couldn't parse option $i expected --cmd-opts=-XX:+UseG1GC where cmd is hadoop-master or some such and -XX:+UseG1GC is the option to pass to the JVM" 1>&2 + exit 1 + fi + ;; + --help) + set +x + echo "Usage: " + echo "---opts" + echo " Set additional Java options for the specified daemon." + echo " " + echo "--replace" + echo " Replace the existing hbase-user-env.sh file if it exists." + echo " " + echo " is one of:" + echo " hbase-master, hbase-regionserver, zookeeper" + echo " " + echo " " + echo "Example Usage:" + echo " --hbase-master-opts=-Xmx2048 --zookeeper-opts=-XX:GCTimeRatio=19" + exit 1 + ;; + --replace) + REPLACE_FILE=true + ;; + *) + echo "Unknown option $i" 1>&2 + exit 1 + ;; + esac +done + +set -x +mkdir /home/hadoop/conf +HBASE_ENV_FILE=/home/hadoop/conf/hbase-user-env.sh + +if [ -d "/home/hadoop/hbase/conf" ] ; then + HBASE_ENV_FILE=/home/hadoop/hbase/conf/hbase-user-env.sh +fi + +if [ $REPLACE_FILE == "true" ] ; then + rm -rf $HBASE_ENV_FILE +fi + +if [ -e $HBASE_ENV_FILE ] ; then + [[ ! 
-n $(grep "#\\!/bin/bash" $HBASE_ENV_FILE ) ]] && echo "#!/bin/bash" >> $HBASE_ENV_FILE +else + echo "#!/bin/bash" >> $HBASE_ENV_FILE +fi + +for i in "$@" ; do + case $i in + --*-opts*) + OPTS_CMD=$(echo $i | sed -r 's|--(.*?)-opts=.*|\1|' | tr 'a-z-' 'A-Z_')_OPTS + OPTS_VALUE=$(echo $i | sed -r 's|--.*?-opts=(.*)|\1|') + cat >> $HBASE_ENV_FILE < "$tmpfile" << 'EOF' + #!/bin/bash + # + # Set up DNS for EMR master/slave instance in VPC. + # This script also set up DNS in us-east-1 for non-VPC to handle ec2 instances, + # whose host name begin with domU, with invalid dns domain name (TT0055043598). + # + set -e + set -x + + alias curl="curl --connect-timeout 2 -q -f --retry-delay 2 --retry 5" + + resolv_conf="/etc/resolv.conf" + dhclient_conf="/etc/dhcp/dhclient.conf" + localhost="127.0.0.1" + metadata="http://169.254.169.254/latest/meta-data" + + restart_network="false" + in_vpc="false" + + mac_address="$(curl $metadata/mac/ | tr '[:upper:]' '[:lower:]')" + region="$(curl http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .region)" + + # wait for the network to come up before proceeding + if [ -e /usr/bin/nm-online ]; then + /usr/bin/nm-online + fi + + get_default_domain() + { + if [ "$region" = "us-east-1" ]; then + echo 'ec2.internal' + else + echo "$region.compute.internal" + fi + } + + get_first_nameserver_from_resolv_conf() + { + awk '$1 ~ /^nameserver/ { print $2 }' "$resolv_conf" + } + + check_vpc() + { + if "$(curl $metadata/network/interfaces/macs/$mac_address/)" | grep -q vpc; then + in_vpc="true" + fi + } + + get_vpc_cidrs() + { + cidrs=$(curl $metadata/network/interfaces/macs/$mac_address/vpc-ipv4-cidr-blocks) + echo "$cidrs" + } + + append_line_to_dhclient_conf() + { + echo "$1" | tee -a "$dhclient_conf" + } + + prepend_domain() + { + #sample line : prepend domain-name "ec2.internal "; + if grep -Eq "^prepend domain-name \"$1[:space:]+\";$" "$dhclient_conf"; then + return + else + append_line_to_dhclient_conf "prepend domain-name \"$1 \";" + restart_network="true" + fi + } + + prepend_domain_server() + { + #sample line : prepend domain-name-servers 127.0.0.1; + if grep -Eq "^prepend domain-name-servers $1;$" "$dhclient_conf"; then + return + fi + append_line_to_dhclient_conf "prepend domain-name-servers $1;" + restart_network="true" + } + + run_dnsmasq() + { + all_domains="$(grep ^search $resolv_conf | cut -d' ' -f2- )" + pid=$(ps -ef | grep dnsmasq | grep synth-domain | awk '{print $2}') + if [ $pid ]; then + kill $pid + fi + for d in $all_domains; do + for c in $(get_vpc_cidrs); do + syn_domains="$syn_domains --synth-domain=$d,$c,ip- " + done + done + runmasq="dnsmasq --listen-address=127.0.0.1 $syn_domains " + eval "$runmasq" + echo "started dnsmasq : $runmasq" + } + + get_host_name() + { + echo "$(hostname -f)" + } + + show_dns_status() + { + type="$1" + echo "------------ $type $resolv_conf ------------" + cat "$resolv_conf" + echo "------------ $type $dhclient_conf ------------" + cat "$dhclient_conf" + hostname="$(get_host_name)" + status="$?" + "'hostname -f' returns : $hostname" + return $status + } + + restart_network_if_needed() + { + if "$restart_network"; then + echo "Updating DNS settings." 
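+      # restarting the network service makes dhclient re-read dhclient.conf, activating the prepended domain and nameserver entries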
+ service network restart + restart_network="false" + fi + } + + main() + { + show_dns_status "BeforeSetup" + + old_domain="$(grep search $resolv_conf | cut -d' ' -f2-)" + default_domain="$(get_default_domain)" + + check_vpc + + if [ "$in_vpc" = "false" ]; then + # NON-VPC + if [ "$region" = "us-east-1" ]; then + if [[ "$old_domain" == "${default_domain}"* ]]; then + echo "$default_domain is already used in us-east-1." + else + echo "Making sure $default_domain is used in us-east-1." + prepend_domain $default_domain + fi + else + echo "Not in VPC, do nothing and exit." + fi + else + # VPC + first_nameserver="$(get_first_nameserver_from_resolv_conf)" + resolving_host_name="$(get_host_name)" + if [ "$1" = "rundnsmasq" -o -z "$resolving_host_name" ]; then + echo "Run dnsmasq" + run_dnsmasq + if [ "$first_nameserver" != "$localhost" ]; then + prepend_domain_server "$localhost" + fi + else + echo "Resolving hostname(${resolving_host_name}) successfully, do nothing and exit." + fi + fi + + restart_network_if_needed + return show_dns_status "AfterSetup" + } + + main "$@" + exit "$?" +EOF + + sudo mv $tmpfile /usr/bin/setup-dns +} + +if [ ! -f /tmp/dns_flag ]; then + resolv_conf="/etc/resolv.conf" + metadata="http://169.254.169.254/latest/meta-data" + mac_address=`curl -s $metadata/mac` + run_dnsmasq + rewrite_setup_dns + touch /tmp/dns_flag + pid="$(/bin/ps axwwo pid,cmd | awk '$12 ~ /aws157.instancecontroller.Main/ { print $1 }')" + sudo kill "$pid" +fi diff --git a/aws/emr/bootstrap/downloadjars.sh b/aws/emr/bootstrap/downloadjars.sh new file mode 100644 index 0000000..49eaa91 --- /dev/null +++ b/aws/emr/bootstrap/downloadjars.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +sudo aws s3 cp s3://ty-emr-pdx/job_input/seagate/lib/atlas-hive-udfs.jar /usr/lib/hive/lib/ +sudo aws s3 cp s3://ty-emr-pdx/job_input/seagate/lib/updates.jar /usr/lib/hive/lib/ diff --git a/aws/emr/bootstrap/dummy_step_1.sh b/aws/emr/bootstrap/dummy_step_1.sh new file mode 100644 index 0000000..4825eea --- /dev/null +++ b/aws/emr/bootstrap/dummy_step_1.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +echo "Hallo!" +echo "Ich heisse dummkopf Stepf eins!" diff --git a/aws/emr/bootstrap/dummy_step_2.sh b/aws/emr/bootstrap/dummy_step_2.sh new file mode 100644 index 0000000..5993ae0 --- /dev/null +++ b/aws/emr/bootstrap/dummy_step_2.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +echo "Hallo!" +echo "Ich heisse dummkopf Stepf zwei!" diff --git a/aws/emr/bootstrap/emr_bootstrap_base.sh b/aws/emr/bootstrap/emr_bootstrap_base.sh new file mode 100644 index 0000000..d2b3e2b --- /dev/null +++ b/aws/emr/bootstrap/emr_bootstrap_base.sh @@ -0,0 +1,174 @@ +#!/bin/bash + +set -x + +install_jupyter=false + +# get input parameters +while [ $# -gt 0 ]; do + case "$1" in + --jupyter) + install_jupyter=true + ;; + -*) + error_msg "unrecognized option: $1" + ;; + *) + break; + ;; + esac + shift +done + +s3_utils='https://s3.amazonaws.com/ty-emr/XRR/utils' +build_vim=false + +if [ "$USER" == "root" ]; then + util_path="/root/.utils" + home="/root" +else + if [ "$platform" == "Darwin" ]; then + users_dir="Users" + fi + if [ "$platform" == "Linux" ]; then + users_dir="home" + fi + util_path="/$users_dir/$USER/.utils" +fi +vim_check=`vim --version` +if [[ $vim_check != *"+lua"* ]]; then + build_vim=true +fi +mkdir -p $util_path + +release=`cat /etc/*release* | tr '[:upper:]' '[:lower:]'` + if [[ $release == *"rhel fedora"* ]]; then + echo "Looks like we're running on something that is kinda like RHEL..." 
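+    # Amazon Linux sets ID_LIKE="rhel fedora" in /etc/os-release, which the lowercased substring match above relies on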
+ sudo yum groupinstall -y "Development Tools" + sudo yum install -y \ + tmux \ + wget \ + htop \ + mlocate \ + git \ + rake \ + zsh \ + jq \ + at \ + bind-utils \ + strace \ + lua \ + lua-devel \ + ncurses \ + ncurses-devel \ + gmp \ + gmp-devel \ + ctags \ + tcl-devel \ + perl \ + perl-devel \ + perl-ExtUtils-ParseXS \ + perl-ExtUtils-CBuilder \ + perl-ExtUtils-Embed + + if [[ $build_vim == true ]]; then + cd /tmp + git clone http://luajit.org/git/luajit-2.0.git + cd luajit-2.0 + make + sudo make install + + cd /tmp + git clone https://github.com/vim/vim.git + cd vim + ./configure \ + --with-features=huge \ + --enable-cscope \ + --enable-pythoninterp \ + --enable-luainterp \ + --enable-multibyte \ + --enable-fontset \ + --disable-gui \ + --without-x \ + --disable-netbeans \ + --enable-largefile + make + sudo make install + + if [ -e /usr/bin/vi ]; then + sudo rm /usr/bin/vi + fi + sudo ln -s /usr/local/bin/vim /usr/bin/vi + rm -rf /tmp/vim + fi + fi + + + wget https://bootstrap.pypa.io/get-pip.py + sudo python2.7 ./get-pip.py + sudo env "PATH=$PATH" pip install awscli + wget "https://raw.githubusercontent.com/o0beaner/dotfiles/master/install.sh" + chmod +x install.sh + ./install.sh + sudo chmod 644 /usr/bin/chsh + sudo chmod +x /usr/bin/chsh + sudo /usr/bin/chsh -s /bin/zsh $USER + sudo updatedb + cd $util_path + wget --no-check-certificate $s3_utils/suntracker.sh + chmod +x $util_path/suntracker.sh + (crontab -l ; echo "0 3 * * * $util_path/suntracker.sh") | crontab - + $util_path/suntracker.sh +# AWS Instance customization +if [ -e /usr/bin/cloud-init ]; then + + # Install SSM Agent + cd /tmp + sudo yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm + + # Am I running EMR? + instance=`aws ec2 describe-instances --instance-ids $(curl -s 169.254.169.254/latest/meta-data/instance-id)` + tags=`echo $instance | jq -r '.Reservations[0].Instances[0].Tags[]'` + cluster_id=`echo $tags | jq -r '. | select(.Key=="aws:elasticmapreduce:job-flow-id") | .Value'` + if [ -n "$cluster_id" ]; then + echo "$cluster_id" > ~/.cluster_id + role=`echo $tags | jq -r '. | select(.Key=="aws:elasticmapreduce:instance-group-role") | .Value'` + + if [ "$role" == "MASTER" ]; then + # ToDo: Incorporate Hue? 
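+            # Zeppelin is configured through an EMR step here too, since it is not yet installed when this bootstrap action runs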
+ + cd $util_path + wget --no-check-certificate $s3_utils/configure_zeppelin_s3.sh + chmod +x $util_path/configure_zeppelin_s3.sh + aws emr add-steps --cluster-id $cluster_id --steps Type=CUSTOM_JAR,Name="Configure Zeppelin for S3",Jar="command-runner.jar",Args=[$util_path/configure_zeppelin_s3.sh] + fi + + # install jupyter + if [ $install_jupyter == true ]; then + cd $util_path + wget --no-check-certificate https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/install-jupyter-emr5.sh + chmod +x $util_path/install-jupyter-emr5.sh + $util_path/install-jupyter-emr5.sh \ + --r \ + --julia \ + --toree \ + --torch \ + --ruby \ + --ds-packages \ + --ml-packages \ + --python-packages ggplot nilearn \ + --port 8002 \ + --password jupyter \ + --jupyterhub \ + --jupyterhub-port 8001 \ + --cached-install \ + --notebook-dir s3://ty-emr/XRR/jupyter/notebooks/ \ + --copy-samples \ + --s3fs + fi + fi +fi + +touch ~/.zsh.prompts +mkdir ~/.zsh.after/ +echo "prompt agnoster" > ~/.zsh.after/prompt.zsh diff --git a/aws/emr/bootstrap/hue_aws.json b/aws/emr/bootstrap/hue_aws.json new file mode 100644 index 0000000..cc70e57 --- /dev/null +++ b/aws/emr/bootstrap/hue_aws.json @@ -0,0 +1,21 @@ +[{ + "Classification": "hue-ini", + "Properties": {}, + "Configurations": [ + { + "Classification": "aws", + "Properties": {}, + "Configurations": [ + { + "Classification": "aws_accounts", + "Properties": {}, + "Configurations": [ + { + "Classification": "default", + "Properties": { + "region": "us-east-1" + } + }] + }] + }] +}] diff --git a/aws/emr/bootstrap/hue_ldap.json b/aws/emr/bootstrap/hue_ldap.json new file mode 100644 index 0000000..6dac341 --- /dev/null +++ b/aws/emr/bootstrap/hue_ldap.json @@ -0,0 +1,46 @@ +[ + { + "Classification": "hue-ini", + "Properties": {}, + "Configurations": [ + { + "Classification": "desktop", + "Properties": {}, + "Configurations": [ + { + "Classification": "ldap", + "Properties": { + "create_users_on_login": "false" + }, + "Configurations": [ + { + "Classification": "ldap_servers", + "Properties": {}, + "Configurations": [ + { + "Classification": "blipsandchitz.local", + "Properties": { + "base_dn": "DC=blipsandchitz,DC=local", + "ldap_url": "ldap://10.0.8.254:389", + "search_bind_authentication": "true", + "bind_dn": "CN=hue,OU=ServiceAccounts,OU=UserAccounts,DC=blipsandchitz,DC=local", + "bind_password": "Badpassword1" + }, + "Configurations": [] + } + ] + } + ] + }, + { + "Classification": "auth", + "Properties": { + "backend": "desktop.auth.backend.LdapBackend,desktop.auth.backend.AllowFirstUserDjangoBackend" + }, + "Configurations": [] + } + ] + } + ] + } +] diff --git a/aws/emr/bootstrap/hue_ldap_group.json b/aws/emr/bootstrap/hue_ldap_group.json new file mode 100644 index 0000000..9acd8aa --- /dev/null +++ b/aws/emr/bootstrap/hue_ldap_group.json @@ -0,0 +1,63 @@ +[ + { + "Classification": "hue-ini", + "Properties": {}, + "Configurations": [ + { + "Classification": "desktop", + "Properties": {}, + "Configurations": [ + { + "Classification": "ldap", + "Properties": { + "create_users_on_login": "false" + }, + "Configurations": [ + { + "Classification": "ldap_servers", + "Properties": {}, + "Configurations": [ + { + "Classification": "blipsandchitz.local", + "Properties": { + "base_dn": "DC=blipsandchitz,DC=local", + "ldap_url": "ldap://10.0.8.254:389", + "search_bind_authentication": "true", + "bind_dn": "CN=hue,OU=ServiceAccounts,OU=UserAccounts,DC=blipsandchitz,DC=local", + "bind_password": "Badpassword1" + }, + "Configurations": [] + } + ] + }, + { + 
"Classification": "users", + "Properties": { + "user_filter": "objectclass=person", + "user_name_attr": "uid" + }, + "Configurations": [] + }, + { + "Classification": "groups", + "Properties": { + "group_filter": "objectclass=groupOfUniqueNames", + "group_name_attr": "cn", + "group_member_attr": "uniqueMember" + }, + "Configurations": [] + } + ] + }, + { + "Classification": "auth", + "Properties": { + "backend": "desktop.auth.backend.LdapBackend,desktop.auth.backend.AllowFirstUserDjangoBackend" + }, + "Configurations": [] + } + ] + } + ] + } +] diff --git a/aws/emr/bootstrap/hue_ldap_group_weirdauth.json b/aws/emr/bootstrap/hue_ldap_group_weirdauth.json new file mode 100644 index 0000000..c165bf9 --- /dev/null +++ b/aws/emr/bootstrap/hue_ldap_group_weirdauth.json @@ -0,0 +1,63 @@ +[ + { + "Classification": "hue-ini", + "Properties": {}, + "Configurations": [ + { + "Classification": "desktop", + "Properties": {}, + "Configurations": [ + { + "Classification": "ldap", + "Properties": { + "create_users_on_login": "false" + }, + "Configurations": [ + { + "Classification": "ldap_servers", + "Properties": {}, + "Configurations": [ + { + "Classification": "blipsandchitz.local", + "Properties": { + "base_dn": "DC=blipsandchitz,DC=local", + "ldap_url": "ldap://10.0.8.254:389", + "search_bind_authentication": "true", + "bind_dn": "CN=hue,OU=ServiceAccounts,OU=UserAccounts,DC=blipsandchitz,DC=local", + "bind_password": "Badpassword1" + }, + "Configurations": [] + } + ] + }, + { + "Classification": "users", + "Properties": { + "user_filter": "objectclass=person", + "user_name_attr": "uid" + }, + "Configurations": [] + }, + { + "Classification": "groups", + "Properties": { + "group_filter": "objectclass=groupOfUniqueNames", + "group_name_attr": "cn", + "group_member_attr": "uniqueMember" + }, + "Configurations": [] + } + ] + }, + { + "Classification": "auth", + "Properties": { + "backend": "desktop.auth.backend.AllowFirstUserDjangoBackend" + }, + "Configurations": [] + } + ] + } + ] + } +] diff --git a/aws/emr/bootstrap/install-jupyter-emr5-payload.sh b/aws/emr/bootstrap/install-jupyter-emr5-payload.sh new file mode 100644 index 0000000..c6afd22 --- /dev/null +++ b/aws/emr/bootstrap/install-jupyter-emr5-payload.sh @@ -0,0 +1,832 @@ +#!/bin/bash +set -x -e + +# AWS EMR bootstrap script +# for installing Jupyter notebook on AWS EMR 5+ +# +# 2016-11-04 - Tom Zeng tomzeng@amazon.com, initial version +# 2016-11-20 - Tom Zeng, add JupyterHub +# 2016-12-01 - Tom Zeng, add s3 support and cached install +# 2016-12-03 - Tom Zeng, use puppet to install/run services +# 2016-12-06 - Tom Zeng, switch to s3fs for S3 support since s3nb is not fully working +# 2016-12-29 - Tom Zeng, add Dask and Dask.distributed +# 2017-04-18 - Tom Zeng, add BigDL support +# 2017-05-16 = Tom Zeng, add cached install for EMR 5.5, updated yum rpm cache and miniCRAN +# 2017-05-20 - Tom Zeng, add s3contents to replace s3nb which no longer works due to Jupyter update +# 2017-05-23 - Tom Zeng, fix the s3contents dummy last_modified field +# 2017-05-25 - Tom Zeng, turn off tensorflow, pip wheel install no longer working, will fix later +# 2017-06-09 - Tom Zeng, fix install issue for EMR 5.6 caused by kernel source package already installed + +# +# Usage: +# --r - install the IRKernel for R (Sparklyr is installed with this option, but as of 2017-04-05 Sparklyr does not support Spark 2.x yet) +# --toree - install the Apache Toree kernel that supports Scala, PySpark, SQL, SparkR for Apache Spark +# --interpreters - specify Apache Toree 
interpreters, default is all: "Scala,SQL,PySpark,SparkR"
+# --julia - install the IJulia kernel for Julia
+# --bigdl - install Intel's BigDL Deep Learning framework
+# --ruby - install the iRuby kernel for Ruby
+# --torch - install the iTorch kernel for Torch
+# --javascript - install the JavaScript and CoffeeScript kernels (only works for JupyterHub for now)
+# --dask - install Dask and Dask.distributed, with the scheduler on the master instance and the workers on the slave instances
+# --ds-packages - install the Python Data Science related packages (scikit-learn pandas numpy numexpr statsmodels seaborn)
+# --ml-packages - install the Python Machine Learning related packages (theano keras tensorflow)
+# --python-packages - install specific python packages e.g. "ggplot nilearn"
+# --port - set the port for Jupyter notebook, default is 8888
+# --user - create a default user for JupyterHub
+# --password - set the password for Jupyter notebook and JupyterHub
+# --localhost-only - restrict jupyter to listen on localhost only, default is to listen on all ip addresses for the instance
+# --jupyterhub - install JupyterHub
+# --jupyterhub-port - set the port for JupyterHub, default is 8000
+# --no-jupyter - if JupyterHub is installed, use this to disable Jupyter
+# --notebook-dir - specify the notebook folder, this could be a local directory or an S3 bucket
+# --cached-install - use some cached dependency artifacts on s3 to speed up installation
+# --ssl - enable ssl, make sure to use your own cert and key files to get rid of the warning
+# --copy-samples - copy sample notebooks to the samples subfolder under the notebook folder
+# --spark-opts - user-supplied Spark options to pass to SPARK_OPTS
+# --s3fs - use s3fs instead of s3contents (default) for storing notebooks on s3, s3fs could cause slowness if the s3 bucket has lots of files
+# --python3 - install python 3 packages and use python3
+
+# check for master node
+IS_MASTER=false
+if grep isMaster /mnt/var/lib/info/instance.json | grep true;
+then
+ IS_MASTER=true
+fi
+
+# error message
+error_msg ()
+{
+ echo 1>&2 "Error: $1"
+}
+
+# some defaults
+RUBY_KERNEL=false
+R_KERNEL=false
+JULIA_KERNEL=false
+TOREE_KERNEL=false
+TORCH_KERNEL=false
+DS_PACKAGES=false
+ML_PACKAGES=false
+PYTHON_PACKAGES=""
+RUN_AS_STEP=false
+NOTEBOOK_DIR=""
+NOTEBOOK_DIR_S3=false
+JUPYTER_PORT=8888
+JUPYTER_PASSWORD=""
+JUPYTER_LOCALHOST_ONLY=false
+PYTHON3=false
+GPU=false
+CPU_GPU="cpu"
+GPUU=""
+JUPYTER_HUB=true
+JUPYTER_HUB_PORT=8000
+JUPYTER_HUB_IP="*"
+JUPYTER_HUB_DEFAULT_USER="jupyter"
+INTERPRETERS="Scala,SQL,PySpark,SparkR"
+R_REPOS_LOCAL="file:////mnt/miniCRAN"
+R_REPOS_REMOTE="http://cran.rstudio.com"
+R_REPOS=$R_REPOS_LOCAL
+USE_CACHED_DEPS=true
+SSL=false
+SSL_OPTS="--no-ssl"
+COPY_SAMPLES=false
+USER_SPARK_OPTS=""
+NOTEBOOK_DIR_S3_S3NB=false
+NOTEBOOK_DIR_S3_S3CONTENTS=true
+JS_KERNEL=false
+NO_JUPYTER=false
+INSTALL_DASK=false
+INSTALL_PY3_PKGS=false
+APACHE_SPARK_VERSION="2.2.0"
+BIGDL=false
+MXNET=false
+DL4J=false
+
+# get input parameters
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --r)
+ R_KERNEL=true
+ ;;
+ --julia)
+ JULIA_KERNEL=true
+ ;;
+ --toree)
+ TOREE_KERNEL=true
+ ;;
+ --torch)
+ TORCH_KERNEL=true
+ ;;
+ --javascript)
+ JS_KERNEL=true
+ ;;
+ --ds-packages)
+ DS_PACKAGES=true
+ ;;
+ --ml-packages)
+ ML_PACKAGES=true
+ ;;
+ --python-packages)
+ shift
+ PYTHON_PACKAGES=$1
+ ;;
+ --bigdl)
+ BIGDL=true
+ ;;
+ --mxnet)
+ MXNET=true
+ ;;
+ --dl4j)
+ DL4J=true
+ ;;
+ --ruby)
+ RUBY_KERNEL=true
+ ;;
+ --gpu)
+ GPU=true
+ CPU_GPU="gpu"
+ GPUU="_gpu"
+ ;;
+ 
--run-as-step) + RUN_AS_STEP=true + ;; + --port) + shift + JUPYTER_PORT=$1 + ;; + --user) + shift + JUPYTER_HUB_DEFAULT_USER=$1 + ;; + --password) + shift + JUPYTER_PASSWORD=$1 + ;; + --localhost-only) + JUPYTER_LOCALHOST_ONLY=true + JUPYTER_HUB_IP="" + ;; + --jupyterhub) + JUPYTER_HUB=true + #PYTHON3=true + ;; + --jupyterhub-port) + shift + JUPYTER_HUB_PORT=$1 + ;; + --notebook-dir) + shift + NOTEBOOK_DIR=$1 + ;; + --copy-samples) + COPY_SAMPLES=true + ;; + --toree-interpreters) + shift + INTERPRETERS=$1 + ;; + --cached-install) + USE_CACHED_DEPS=true + R_REPOS=$R_REPOS_LOCAL + ;; + --no-cached-install) + USE_CACHED_DEPS=false + R_REPOS=$R_REPOS_REMOTE + ;; + --no-jupyter) + NO_JUPYTER=true + ;; + --ssl) + SSL=true + ;; + --dask) + INSTALL_DASK=true + ;; + --python3) + INSTALL_PY3_PKGS=true + ;; + --spark-opts) + shift + USER_SPARK_OPTS=$1 + ;; + --spark-version) + shift + APACHE_SPARK_VERSION=$1 + ;; + --s3fs) + #NOTEBOOK_DIR_S3_S3NB=false + NOTEBOOK_DIR_S3_S3CONTENTS=false + ;; + #--s3nb) # this stopped working after Jupyter update in early 2017 + # NOTEBOOK_DIR_S3_S3NB=true + # ;; + -*) + # do not exit out, just note failure + error_msg "unrecognized option: $1" + ;; + *) + break; + ;; + esac + shift +done + + + + + +RELEASE=$(cat /etc/system-release) +REL_NUM=$(ruby -e "puts '$RELEASE'.split.last") + + +sudo mkdir -p /mnt/var/aws/emr +sudo cp -pr /var/aws/emr/packages /mnt/var/aws/emr/ && sudo rm -rf /var/aws/emr/packages && sudo mkdir /var/aws/emr/packages && sudo mount -o bind /mnt/var/aws/emr/packages /var/aws/emr/packages & + +# move /usr/local and usr/share to /mnt/usr-moved/ to avoid running out of space on / +if [ ! -d /mnt/usr-moved ]; then + echo "move local start" >> /tmp/install_time.log + date >> /tmp/install_time.log + sudo mkdir /mnt/usr-moved + sudo mv /usr/local /mnt/usr-moved/ && sudo ln -s /mnt/usr-moved/local /usr/ + echo "move local end, move share start" >> /tmp/install_time.log + date >> /tmp/install_time.log + sudo mv /usr/share /mnt/usr-moved/ && sudo ln -s /mnt/usr-moved/share /usr/ + echo "move share end" >> /tmp/install_time.log + date >> /tmp/install_time.log +fi + +export MAKE='make -j 8' + +export NODE_PATH='/usr/lib/node_modules' +if [ "$JS_KERNEL" = true ]; then + sudo python -m pip install -U jinja2 tornado jsonschema pyzmq + sudo npm cache clean -f + sudo npm install -g npm + sudo npm install -g n + sudo n stable +fi + +cd /mnt + +TF_BINARY_URL_PY3="https://storage.googleapis.com/tensorflow/linux/$CPU_GPU/tensorflow$GPUU-1.1.0-cp34-cp34m-linux_x86_64.whl" +TF_BINARY_URL="https://storage.googleapis.com/tensorflow/linux/$CPU_GPU/tensorflow$GPUU-1.1.0-cp27-none-linux_x86_64.whl" + + + + +if [ "$DS_PACKAGES" = true ]; then + # Python + if [ "$INSTALL_PY3_PKGS" = true ]; then + sudo python3 -m pip install -U scikit-learn pandas numpy numexpr statsmodels scipy + else + sudo python -m pip install -U scikit-learn pandas numpy numexpr statsmodels scipy + fi + # Javascript + if [ "$JS_KERNEL" = true ]; then + sudo npm install -g --unsafe-perm stats-analysis decision-tree machine_learning limdu synaptic node-svm lda brain.js scikit-node + fi +fi + +if [ "$ML_PACKAGES" = true ]; then + if [ "$INSTALL_PY3_PKGS" = true ]; then + sudo python3 -m pip install -U theano + sudo python3 -m pip install -U keras + sudo python3 -m pip install -U $TF_BINARY_URL_PY3 + else + sudo python -m pip install -U theano + sudo python -m pip install -U keras + sudo python -m pip install -U $TF_BINARY_URL + fi +fi + +if [ ! 
"$PYTHON_PACKAGES" = "" ]; then + if [ "$INSTALL_PY3_PKGS" = true ]; then + sudo python3 -m pip install -U $PYTHON_PACKAGES || true + else + sudo python -m pip install -U $PYTHON_PACKAGES || true + fi +fi + +if [ "$BIGDL" = true ]; then + aws s3 cp s3://tomzeng/maven/apache-maven-3.3.3-bin.tar.gz . + tar xvfz apache-maven-3.3.3-bin.tar.gz + sudo mv apache-maven-3.3.3 /opt/maven + sudo ln -s /opt/maven/bin/mvn /usr/bin/mvn + + git clone https://github.com/intel-analytics/BigDL.git + cd BigDL/ + export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" + export BIGDL_HOME=/mnt/BigDL + export BIGDL_VER="0.2.0-SNAPSHOT" + bash make-dist.sh -P spark_2.1 + mkdir /tmp/bigdl_summaries + /usr/local/bin/tensorboard --debug INFO --logdir /tmp/bigdl_summaries/ > /tmp/tensorboard_bigdl.log 2>&1 & +fi + +if [ "$JULIA_KERNEL" = true ]; then + # Julia install + cd /mnt + if [ ! "$USE_CACHED_DEPS" = true ]; then + wget https://julialang.s3.amazonaws.com/bin/linux/x64/0.5/julia-0.5.0-linux-x86_64.tar.gz + tar xvfz julia-0.5.0-linux-x86_64.tar.gz + fi + cd julia-3c9d75391c + sudo cp -pr bin/* /usr/bin/ + sudo cp -pr lib/* /usr/lib/ + #sudo cp -pr libexec/* /usr/libexec/ + sudo cp -pr share/* /usr/share/ + sudo cp -pr include/* /usr/include/ +fi + +if [ "$INSTALL_DASK" = true ]; then + if [ "$INSTALL_PY3_PKGS" = true ]; then + sudo python3 -m pip install -U dask[complete] distributed + else + sudo python -m pip install -U dask[complete] distributed + fi + export PATH=$PATH:/usr/local/bin + if [ "$IS_MASTER" = true ]; then + dask-scheduler > /var/log/dask-scheduler.log 2>&1 & + else + MASTER_KV=$(grep masterHost /emr/instance-controller/lib/info/job-flow-state.txt) + MASTER_HOST=$(ruby -e "puts '$MASTER_KV'.gsub('\"','').split.last") + dask-worker $MASTER_HOST:8786 > /var/log/dask-worker.log 2>&1 & + fi +fi + +#echo ". /mnt/ipython-env/bin/activate" >> ~/.bashrc + +# only run below on master instance +if [ "$IS_MASTER" = true ]; then + + + +sudo mkdir -p /var/log/jupyter +mkdir -p ~/.jupyter +touch ls ~/.jupyter/jupyter_notebook_config.py + +sed -i '/c.NotebookApp.open_browser/d' ~/.jupyter/jupyter_notebook_config.py +echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py + +if [ ! "$JUPYTER_LOCALHOST_ONLY" = true ]; then +sed -i '/c.NotebookApp.ip/d' ~/.jupyter/jupyter_notebook_config.py +echo "c.NotebookApp.ip='*'" >> ~/.jupyter/jupyter_notebook_config.py +fi + +sed -i '/c.NotebookApp.port/d' ~/.jupyter/jupyter_notebook_config.py +echo "c.NotebookApp.port = $JUPYTER_PORT" >> ~/.jupyter/jupyter_notebook_config.py + +if [ ! 
"$JUPYTER_PASSWORD" = "" ]; then + sed -i '/c.NotebookApp.password/d' ~/.jupyter/jupyter_notebook_config.py + HASHED_PASSWORD=$(python3 -c "from notebook.auth import passwd; print(passwd('$JUPYTER_PASSWORD'))") + echo "c.NotebookApp.password = u'$HASHED_PASSWORD'" >> ~/.jupyter/jupyter_notebook_config.py +else + sed -i '/c.NotebookApp.token/d' ~/.jupyter/jupyter_notebook_config.py + echo "c.NotebookApp.token = u''" >> ~/.jupyter/jupyter_notebook_config.py +fi + +echo "c.Authenticator.admin_users = {'$JUPYTER_HUB_DEFAULT_USER'}" >> ~/.jupyter/jupyter_notebook_config.py +echo "c.LocalAuthenticator.create_system_users = True" >> ~/.jupyter/jupyter_notebook_config.py + +if [ "$SSL" = true ]; then + #NOTE - replace server.cert and server.key with your own cert and key files + CERT=/usr/local/etc/server.cert + KEY=/usr/local/etc/server.key + sudo openssl req -x509 -nodes -days 3650 -newkey rsa:1024 -keyout $KEY -out $CERT -subj "/C=US/ST=Washington/L=Seattle/O=JupyterCert/CN=JupyterCert" + + # the following works for Jupyter but will fail JupyterHub, use options for both instead + #echo "c.NotebookApp.certfile = u'/usr/local/etc/server.cert'" >> ~/.jupyter/jupyter_notebook_config.py + #echo "c.NotebookApp.keyfile = u'/usr/local/etc/server.key'" >> ~/.jupyter/jupyter_notebook_config.py + + SSL_OPTS_JUPYTER="--keyfile=/usr/local/etc/server.key --certfile=/usr/local/etc/server.cert" + SSL_OPTS_JUPYTERHUB="--ssl-key=/usr/local/etc/server.key --ssl-cert=/usr/local/etc/server.cert" +fi + + +# Javascript/CoffeeScript kernels +if [ "$JS_KERNEL" = true ]; then + sudo npm install -g --unsafe-perm ijavascript d3 lodash plotly jp-coffeescript + sudo ijs --ijs-install=global + sudo jp-coffee --jp-install=global +fi + + + +if [ "$JULIA_KERNEL" = true ]; then + julia -e 'Pkg.add("IJulia")' + julia -e 'Pkg.add("RDatasets");Pkg.add("Gadfly");Pkg.add("DataFrames");Pkg.add("PyPlot")' + # Julia's Spark support does not support Spark on Yarn yet + # install mvn + #cd /mnt + #aws s3 cp s3://tomzeng/maven/apache-maven-3.3.9-bin.tar.gz . + #tar xvfz apache-maven-3.3.9-bin.tar.gz + #sudo mv apache-maven-3.3.9 /opt/maven + #sudo ln -s /opt/maven/bin/mvn /usr/bin/mvn + # install Spark for Julia + #julia -e 'Pkg.clone("https://github.com/dfdx/Spark.jl"); Pkg.build("Spark"); Pkg.checkout("JavaCall")' +fi + +# iTorch depends on Torch which is installed with --ml-packages +if [ "$TORCH_KERNEL" = true ]; then + set +e # workaround for the lengthy torch install-deps, esp when other background process are also running yum + cd /mnt + if [ ! "$USE_CACHED_DEPS" = true ]; then + git clone https://github.com/torch/distro.git torch-distro + fi + cd torch-distro + git pull + ./install-deps + ./install.sh -b + export PATH=$PATH:/mnt/torch-distro/install/bin + source ~/.profile + luarocks install lzmq + luarocks install gnuplot + cd /mnt + if [ ! "$USE_CACHED_DEPS" = true ]; then + git clone https://github.com/facebook/iTorch.git + fi + cd iTorch + luarocks make + sudo cp -pr ~/.ipython/kernels/itorch /usr/local/share/jupyter/kernels/ + set -e +fi + + + + +if [ ! "$NOTEBOOK_DIR" = "" ]; then + NOTEBOOK_DIR="${NOTEBOOK_DIR%/}/" # remove trailing / if exists then add / + if [[ "$NOTEBOOK_DIR" == s3://* ]]; then + NOTEBOOK_DIR_S3=true + # the s3nb does not fully working yet(upload and createe folder not working) + # s3nb does not work anymore due to Jupyter update + if [ "$NOTEBOOK_DIR_S3_S3NB" = true ]; then + cd /mnt + if [ ! 
"$USE_CACHED_DEPS" = true ]; then + git clone https://github.com/tomz/s3nb.git + fi + cd s3nb + sudo python -m pip install -U entrypoints + sudo python setup.py install + if [ "$JUPYTER_HUB" = true ]; then + sudo python3 -m pip install -U entrypoints + sudo python3 setup.py install + fi + + echo "c.NotebookApp.contents_manager_class = 's3nb.S3ContentsManager'" >> ~/.jupyter/jupyter_notebook_config.py + echo "c.S3ContentsManager.checkpoints_kwargs = {'root_dir': '~/.checkpoints'}" >> ~/.jupyter/jupyter_notebook_config.py + # if just bucket with no subfolder, a trailing / is required, otherwise s3nb will break + echo "c.S3ContentsManager.s3_base_uri = '$NOTEBOOK_DIR'" >> ~/.jupyter/jupyter_notebook_config.py + #echo "c.S3ContentsManager.s3_base_uri = '${NOTEBOOK_DIR_S3%/}/%U'" >> ~/.jupyter/jupyter_notebook_config.py + #echo "c.Spawner.default_url = '${NOTEBOOK_DIR_S3%/}/%U'" >> ~/.jupyter/jupyter_notebook_config.py + #echo "c.Spawner.notebook_dir = '/%U'" >> ~/.jupyter/jupyter_notebook_config.py + elif [ "$NOTEBOOK_DIR_S3_S3CONTENTS" = true ]; then + BUCKET=$(ruby -e "puts '$NOTEBOOK_DIR'.split('//')[1].split('/')[0]") + FOLDER=$(ruby -e "puts '$NOTEBOOK_DIR'.split('//')[1].split('/')[1..-1].join('/')") + #sudo python -m pip install -U s3contents + cd /mnt + #aws s3 cp s3://aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/s3contents.zip . + #unzip s3contents.zip + git clone https://github.com/tomz/s3contents.git + cd s3contents + sudo python setup.py install + echo "c.NotebookApp.contents_manager_class = 's3contents.S3ContentsManager'" >> ~/.jupyter/jupyter_notebook_config.py + echo "c.S3ContentsManager.bucket_name = '$BUCKET'" >> ~/.jupyter/jupyter_notebook_config.py + echo "c.S3ContentsManager.prefix = '$FOLDER'" >> ~/.jupyter/jupyter_notebook_config.py + # this following is no longer needed, default was fixed in the latest on github + #echo "c.S3ContentsManager.endpoint_url = 'https://s3.amazonaws.com'" >> ~/.jupyter/jupyter_notebook_config.py + else + BUCKET=$(ruby -e "puts '$NOTEBOOK_DIR'.split('//')[1].split('/')[0]") + FOLDER=$(ruby -e "puts '$NOTEBOOK_DIR'.split('//')[1].split('/')[1..-1].join('/')") + if [ "$USE_CACHED_DEPS" != true ]; then + sudo yum install -y automake fuse fuse-devel libxml2-devel + fi + cd /mnt + git clone https://github.com/s3fs-fuse/s3fs-fuse.git + cd s3fs-fuse/ + ls -alrt + ./autogen.sh + ./configure + make + sudo make install + sudo su -c 'echo user_allow_other >> /etc/fuse.conf' + mkdir -p /mnt/s3fs-cache + mkdir -p /mnt/$BUCKET + #/usr/local/bin/s3fs -o allow_other -o iam_role=auto -o umask=0 $BUCKET /mnt/$BUCKET + # -o nodnscache -o nosscache -o parallel_count=20 -o multipart_size=50 + /usr/local/bin/s3fs -o allow_other -o iam_role=auto -o umask=0 -o url=https://s3.amazonaws.com -o no_check_certificate -o enable_noobj_cache -o use_cache=/mnt/s3fs-cache $BUCKET /mnt/$BUCKET + #/usr/local/bin/s3fs -o allow_other -o iam_role=auto -o umask=0 -o use_cache=/mnt/s3fs-cache $BUCKET /mnt/$BUCKET + echo "c.NotebookApp.notebook_dir = '/mnt/$BUCKET/$FOLDER'" >> ~/.jupyter/jupyter_notebook_config.py + echo "c.ContentsManager.checkpoints_kwargs = {'root_dir': '.checkpoints'}" >> ~/.jupyter/jupyter_notebook_config.py + fi + else + echo "c.NotebookApp.notebook_dir = '$NOTEBOOK_DIR'" >> ~/.jupyter/jupyter_notebook_config.py + echo "c.ContentsManager.checkpoints_kwargs = {'root_dir': '.checkpoints'}" >> ~/.jupyter/jupyter_notebook_config.py + fi +fi + +if [ ! 
"$JUPYTER_HUB_DEFAULT_USER" = "" ]; then + sudo adduser $JUPYTER_HUB_DEFAULT_USER +fi + +if [ "$COPY_SAMPLES" = true ]; then + cd ~ + if [ "$NOTEBOOK_DIR_S3" = true ]; then + aws s3 sync s3://aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/notebooks/ ${NOTEBOOK_DIR}samples/ || true + else + if [ ! "$NOTEBOOK_DIR" = "" ]; then + mkdir -p ${NOTEBOOK_DIR}samples || true + sudo mkdir /home/$JUPYTER_HUB_DEFAULT_USER/${NOTEBOOK_DIR}samples || true + fi + aws s3 sync s3://aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/notebooks/ ${NOTEBOOK_DIR}samples || true + sudo cp -pr ${NOTEBOOK_DIR}samples /home/$JUPYTER_HUB_DEFAULT_USER/ + sudo chown -R $JUPYTER_HUB_DEFAULT_USER:$JUPYTER_HUB_DEFAULT_USER /home/$JUPYTER_HUB_DEFAULT_USER/${NOTEBOOK_DIR}samples + fi + if [ "$BIGDL" = true ]; then + aws s3 cp s3://aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/notebooks/text_classfication.ipynb ${NOTEBOOK_DIR}. + sudo cp ${NOTEBOOK_DIR}text_classfication.ipynb /home/$JUPYTER_HUB_DEFAULT_USER/${NOTEBOOK_DIR} + sudo chown -R $JUPYTER_HUB_DEFAULT_USER:$JUPYTER_HUB_DEFAULT_USER /home/$JUPYTER_HUB_DEFAULT_USER/${NOTEBOOK_DIR}text_classfication.ipynb + fi +fi + + +wait_for_spark() { + # wait SparkR file to show up + while [ ! -f /etc/spark/conf/spark-defaults.conf ] + do + sleep 10 + done +} + +setup_jupyter_process_with_bigdl() { + wait_for_spark + export PYTHON_API_PATH=${BIGDL_HOME}/dist/lib/bigdl-$BIGDL_VER-python-api.zip + export BIGDL_JAR_PATH=${BIGDL_HOME}/dist/lib/bigdl-$BIGDL_VER-jar-with-dependencies.jar + cat ${BIGDL_HOME}/dist/conf/spark-bigdl.conf | sudo tee -a /etc/spark/conf/spark-defaults.conf + sudo puppet apply << PUPPET_SCRIPT + include 'upstart' + upstart::job { 'jupyter': + description => 'Jupyter', + respawn => true, + respawn_limit => '0 10', + start_on => 'runlevel [2345]', + stop_on => 'runlevel [016]', + console => 'output', + chdir => '/home/hadoop', + script => ' + sudo su - hadoop > /var/log/jupyter/jupyter.log 2>&1 <> /etc/spark/conf/spark-defaults.conf" + fi + + if [ ! 
-f /tmp/Renvextra ]; then # check if the rstudio BA maybe already done this + cat << 'EOF' > /tmp/Renvextra +JAVA_HOME="/etc/alternatives/jre" +HADOOP_HOME_WARN_SUPPRESS="true" +HADOOP_HOME="/usr/lib/hadoop" +HADOOP_PREFIX="/usr/lib/hadoop" +HADOOP_MAPRED_HOME="/usr/lib/hadoop-mapreduce" +HADOOP_YARN_HOME="/usr/lib/hadoop-yarn" +HADOOP_COMMON_HOME="/usr/lib/hadoop" +HADOOP_HDFS_HOME="/usr/lib/hadoop-hdfs" +HADOOP_CONF_DIR="/usr/lib/hadoop/etc/hadoop" +YARN_CONF_DIR="/usr/lib/hadoop/etc/hadoop" +YARN_HOME="/usr/lib/hadoop-yarn" +HIVE_HOME="/usr/lib/hive" +HIVE_CONF_DIR="/usr/lib/hive/conf" +HBASE_HOME="/usr/lib/hbase" +HBASE_CONF_DIR="/usr/lib/hbase/conf" +SPARK_HOME="/usr/lib/spark" +SPARK_CONF_DIR="/usr/lib/spark/conf" +PATH=${PWD}:${PATH} +EOF + + #if [ "$PYSPARK_PYTHON" = "python3" ]; then + if [ "$INSTALL_PY3_PKGS" = true ]; then + cat << 'EOF' >> /tmp/Renvextra +PYSPARK_PYTHON="python3" +EOF + fi + + cat /tmp/Renvextra | sudo tee -a /usr/lib64/R/etc/Renviron + + sudo mkdir -p /mnt/spark + sudo chmod a+rwx /mnt/spark + if [ -d /mnt1 ]; then + sudo mkdir -p /mnt1/spark + sudo chmod a+rwx /mnt1/spark + fi + + + + set +e # workaround for if SparkR is already installed by other BA + # install SparkR and SparklyR for R - toree ifself does not need this + sudo R --no-save << R_SCRIPT + library(devtools) + install('/usr/lib/spark/R/lib/SparkR') +R_SCRIPT + set -e + + fi # end if -f /tmp/Renvextra + + + export SPARK_HOME="/usr/lib/spark" + SPARK_PACKAGES="" + + PYSPARK_PYTHON="python" + if [ "$INSTALL_PY3_PKGS" = true ]; then + PYSPARK_PYTHON="python3" + fi + + if [ ! "$USER_SPARK_OPTS" = "" ]; then + SPARK_OPTS=$USER_SPARK_OPTS + SPARK_PACKAGES=$(ruby -e "opts='$SPARK_OPTS'.split;pkgs=nil;opts.each_with_index{|o,i| pkgs=opts[i+1] if o.start_with?('--packages')};puts pkgs || '$SPARK_PACKAGES'") + export SPARK_OPTS + export SPARK_PACKAGES + + sudo jupyter toree install --interpreters=$INTERPRETERS --spark_home=$SPARK_HOME --python_exec=$PYSPARK_PYTHON --spark_opts="$SPARK_OPTS" + # NOTE - toree does not pick SPARK_OPTS, so use the following workaround until it's fixed + if [ ! "$SPARK_PACKAGES" = "" ]; then + if ! 
grep "spark.jars.packages" /etc/spark/conf/spark-defaults.conf; then + sudo bash -c "echo 'spark.jars.packages $SPARK_PACKAGES' >> /etc/spark/conf/spark-defaults.conf" + fi + fi + else + sudo jupyter toree install --interpreters=$INTERPRETERS --spark_home=$SPARK_HOME --python_exec=$PYSPARK_PYTHON + fi + + + if [ "$INSTALL_PY3_PKGS" = true ]; then + sudo bash -c 'echo "" >> /etc/spark/conf/spark-env.sh' + sudo bash -c 'echo "export PYSPARK_PYTHON=/usr/bin/python3" >> /etc/spark/conf/spark-env.sh' + + #if [ -f /usr/local/share/jupyter/kernels/apache_toree_pyspark/kernel.json ]; then + # sudo bash -c 'sed -i "s/\"PYTHON_EXEC\": \"python\"/\"PYTHON_EXEC\": \"\/usr\/bin\/python3\"/g" /usr/local/share/jupyter/kernels/apache_toree_pyspark/kernel.json' + #fi + + fi + + # the following dirs could cause conflict, so remove them + rm -rf ~/.m2/ + rm -rf ~/.ivy2/ + + if [ "$NO_JUPYTER" = false ]; then + echo "Starting Jupyter notebook via pyspark" + cd ~ + #PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser" pyspark > /var/log/jupyter/jupyter.log & + if [ "$BIGDL" = false ]; then + sudo puppet apply << PUPPET_SCRIPT + include 'upstart' + upstart::job { 'jupyter': + description => 'Jupyter', + respawn => true, + respawn_limit => '0 10', + start_on => 'runlevel [2345]', + stop_on => 'runlevel [016]', + console => 'output', + chdir => '/home/hadoop', + script => ' + sudo su - hadoop > /var/log/jupyter/jupyter.log 2>&1 < 'Jupyter', + respawn => true, + respawn_limit => '0 10', + start_on => 'runlevel [2345]', + stop_on => 'runlevel [016]', + console => 'output', + chdir => '/home/hadoop', + env => { 'NOTEBOOK_DIR' => '$NOTEBOOK_DIR', 'NODE_PATH' => '$NODE_PATH' }, + exec => 'sudo su - hadoop -c "jupyter notebook --no-browser $SSL_OPTS_JUPYTER" > /var/log/jupyter/jupyter.log 2>&1', + } +PUPPET_SCRIPT + else + setup_jupyter_process_with_bigdl & + fi + fi +fi + +if [ "$JUPYTER_HUB" = true ]; then + sudo npm install -g --unsafe-perm configurable-http-proxy + sudo python3 -m pip install jupyterhub #notebook ipykernel + #sudo python3 -m ipykernel install + + if [ ! "$JUPYTER_HUB_DEFAULT_USER" = "" ]; then + create_hdfs_user & + fi + # change the password of the hadoop user to JUPYTER_PASSWORD + if [ ! 
"$JUPYTER_PASSWORD" = "" ]; then + sudo sh -c "echo '$JUPYTER_PASSWORD' | passwd $JUPYTER_HUB_DEFAULT_USER --stdin" + fi + + sudo ln -sf /usr/local/bin/jupyterhub /usr/bin/ + sudo ln -sf /usr/local/bin/jupyterhub-singleuser /usr/bin/ + mkdir -p /mnt/jupyterhub + cd /mnt/jupyterhub + echo "Starting Jupyterhub" + #sudo jupyterhub $SSL_OPTS_JUPYTERHUB --port=$JUPYTER_HUB_PORT --ip=$JUPYTER_HUB_IP --log-file=/var/log/jupyter/jupyterhub.log --config ~/.jupyter/jupyter_notebook_config.py & + sudo puppet apply << PUPPET_SCRIPT + include 'upstart' + upstart::job { 'jupyterhub': + description => 'JupyterHub', + respawn => true, + respawn_limit => '0 10', + start_on => 'runlevel [2345]', + stop_on => 'runlevel [016]', + console => 'output', + chdir => '/mnt/jupyterhub', + env => { 'NOTEBOOK_DIR' => '$NOTEBOOK_DIR', 'NODE_PATH' => '$NODE_PATH' }, + exec => 'sudo /usr/bin/jupyterhub --pid-file=/var/run/jupyter.pid $SSL_OPTS_JUPYTERHUB --port=$JUPYTER_HUB_PORT --ip=$JUPYTER_HUB_IP --log-file=/var/log/jupyter/jupyterhub.log --config /home/hadoop/.jupyter/jupyter_notebook_config.py' + } +PUPPET_SCRIPT + +fi + +cat << 'EOF' > /tmp/jupyter_logpusher.config +{ + "/var/log/jupyter/" : { + "includes" : [ "(.*)" ], + "s3Path" : "node/$instance-id/applications/jupyter/$0", + "retentionPeriod" : "5d", + "logType" : [ "USER_LOG", "SYSTEM_LOG" ] + } +} +EOF +cat /tmp/jupyter_logpusher.config | sudo tee -a /etc/logpusher/jupyter.config + +fi +echo "Bootstrap action finished" diff --git a/aws/emr/bootstrap/install_jupyter.sh b/aws/emr/bootstrap/install_jupyter.sh new file mode 100644 index 0000000..82b7f7c --- /dev/null +++ b/aws/emr/bootstrap/install_jupyter.sh @@ -0,0 +1,23 @@ +#!/bin/bash + + wget --no-check-certificate https://s3.amazonaws.com/aws-bigdata-blog/artifacts/aws-blog-emr-jupyter/install-jupyter-emr5.sh + aws s3 cp s3://ty-emr/XRR/bootstrap/install-jupyter-emr5-payload.sh . +chmod +x $util_path/install-jupyter-emr5-payload.sh +chmod +x install-jupyter-emr5-payload.sh +./install-jupyter-emr5-payload.sh \ + --r \ + --julia \ + --toree \ + --torch \ + --ruby \ + --ds-packages \ + --ml-packages \ + --python-packages ggplot nilearn \ + --port 8002 \ + --password jupyter \ + --jupyterhub \ + --jupyterhub-port 8001 \ + --cached-install \ + --notebook-dir s3://ty-emr/XRR/jupyter/notebooks/ \ + --copy-samples \ + --s3fs diff --git a/aws/emr/bootstrap/install_sift.sh b/aws/emr/bootstrap/install_sift.sh new file mode 100644 index 0000000..fb1cf25 --- /dev/null +++ b/aws/emr/bootstrap/install_sift.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +cd ~ +wget https://sift-tool.org/downloads/sift/sift_0.9.0_linux_amd64.tar.gz +tar xvzf sift* +sudo mv sift*/sift /usr/local/bin/ +rm -rf sift* diff --git a/aws/emr/bootstrap/instance-controller/instance-controller-interface.jar b/aws/emr/bootstrap/instance-controller/instance-controller-interface.jar new file mode 100644 index 0000000..3ef86d3 Binary files /dev/null and b/aws/emr/bootstrap/instance-controller/instance-controller-interface.jar differ diff --git a/aws/emr/bootstrap/instance-controller/instance-controller.jar b/aws/emr/bootstrap/instance-controller/instance-controller.jar new file mode 100644 index 0000000..599670a Binary files /dev/null and b/aws/emr/bootstrap/instance-controller/instance-controller.jar differ diff --git a/aws/emr/bootstrap/presto-postgres.sh b/aws/emr/bootstrap/presto-postgres.sh new file mode 100644 index 0000000..36339e4 --- /dev/null +++ b/aws/emr/bootstrap/presto-postgres.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +configure_postgres() { + while ! 
+ do
+ sleep 1
+ done
+ aws s3 cp s3://ty-emr/XRR/presto/pg1.properties ~
+ aws s3 cp s3://ty-emr/XRR/presto/pg2.properties ~
+ sudo mv ~/*.properties /etc/presto/conf/catalog
+ sudo restart presto-server
+}
+
+configure_postgres &
diff --git a/aws/emr/bootstrap/replace-instance-controller-and-interface.sh b/aws/emr/bootstrap/replace-instance-controller-and-interface.sh
new file mode 100644
index 0000000..12a6875
--- /dev/null
+++ b/aws/emr/bootstrap/replace-instance-controller-and-interface.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+[ -e /mnt/replaced-instance-controller ] && exit 0
+sudo find /usr/share/aws/emr/instance-controller/lib -name 'instance-controller*.jar' -exec rm {} \;
+sudo aws s3 cp s3://ty-emr/XRR/bootstrap/instance-controller/instance-controller.jar /usr/share/aws/emr/instance-controller/lib/
+sudo aws s3 cp s3://ty-emr/XRR/bootstrap/instance-controller/instance-controller-interface.jar /usr/share/aws/emr/instance-controller/lib/
+touch /mnt/replaced-instance-controller
+sudo service instance-controller stop
+sudo service logpusher stop
diff --git a/aws/emr/bootstrap/replace-instance-controller.sh b/aws/emr/bootstrap/replace-instance-controller.sh
new file mode 100644
index 0000000..02cd21b
--- /dev/null
+++ b/aws/emr/bootstrap/replace-instance-controller.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+[ -e /mnt/replaced-instance-controller ] && exit 0
+sudo find /usr/share/aws/emr/instance-controller/lib -name 'instance-controller*.jar' -and -not -name '*interface*' -exec rm {} \;
+sudo aws s3 cp s3://ty-emr/XRR/bootstrap/instance-controller/instance-controller.jar /usr/share/aws/emr/instance-controller/lib/
+touch /mnt/replaced-instance-controller
+sudo service instance-controller stop
+sudo service logpusher stop
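
A minimal sketch of how one of these bootstrap actions might be attached when launching a cluster. The release label, instance settings, and key name below are assumptions for illustration, not values taken from this repo; only the s3://ty-emr/XRR/bootstrap/ prefix and the --jupyter flag come from the scripts above.

# launch an EMR 5.x cluster that runs boot_strappy_486.sh on every node at provisioning time
aws emr create-cluster \
  --name "xrr-bootstrap-test" \
  --release-label emr-5.8.0 \
  --applications Name=Spark Name=Zeppelin \
  --instance-type m4.xlarge \
  --instance-count 3 \
  --use-default-roles \
  --ec2-attributes KeyName=my-key \
  --bootstrap-actions Path=s3://ty-emr/XRR/bootstrap/boot_strappy_486.sh,Args=["--jupyter"]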