-
Notifications
You must be signed in to change notification settings - Fork 5
/
ann_benchmark_quantization_aws.sh
executable file
·137 lines (111 loc) · 5.05 KB
/
ann_benchmark_quantization_aws.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
# Provisions a throwaway EC2 instance for an ANN quantization benchmark run:
# a security group that allows inbound SSH, a fresh key pair (written to
# ./key-<run id>.pem) and a single instance with a 120 GB root volume.
#
# Optional environment: MACHINE_TYPE, DATASET, DISTANCE (defaults below).
# Requires: aws CLI, jq, uuidgen.
set -e
MACHINE_TYPE="${MACHINE_TYPE:-"m6i.2xlarge"}"
CLOUD_PROVIDER="aws"
OS="ubuntu-2204"
ARCH=amd64
dataset=${DATASET:-"sift-128-euclidean"}
distance=${DISTANCE:-"l2-squared"}
region="eu-central-1"

# Abort with a message when a looked-up or freshly created identifier is
# missing. jq -r prints the literal string "null" for absent fields, so that
# is treated the same as an empty value.
# Arguments: $1 - human readable label, $2 - value to check
require_value() {
  local label=$1 value=$2
  if [[ -z "$value" || "$value" == "null" ]]; then
    printf 'error: could not determine %s\n' "$label" >&2
    exit 1
  fi
}

# Best-effort removal of the key pair and security group when provisioning
# aborts part-way. This handler only covers the provisioning phase: the
# script's main cleanup trap, installed further down, replaces it.
rollback_provisioning() {
  set +e # keep rolling back even if an individual deletion fails
  echo "Provisioning failed; removing partially created AWS resources" >&2
  if [[ -n "${key_pair_created:-}" ]]; then
    aws ec2 delete-key-pair --key-name "$key_id" --region "$region"
  fi
  if [[ -n "${key_id:-}" ]]; then
    rm -f -- "${key_id}.pem"
  fi
  if [[ -n "${group_id:-}" && "$group_id" != "null" ]]; then
    aws ec2 delete-security-group --group-id "$group_id" --region "$region"
  fi
}
trap rollback_provisioning EXIT

# to make sure all aws resources are unique
# (character classes are quoted so the shell cannot glob-expand them)
run_id=$(uuidgen | tr '[:upper:]' '[:lower:]')
require_value "run id (is uuidgen installed?)" "$run_id"
key_id="key-$run_id"

# Read-only lookups come first so that a failure here leaves nothing behind.
# Each aws call is captured on its own line: under `set -e` a failing aws
# command now stops the script instead of being hidden behind jq's status.
vpc_json=$(aws ec2 describe-vpcs --region "$region")
vpc_id=$(jq -r '.Vpcs[0].VpcId' <<<"$vpc_json")
require_value "VPC id" "$vpc_id"
# subnet_id=$( aws ec2 describe-subnets --region $region --filters=Name=vpc-id,Values=$vpc_id | jq -r '.Subnets[3].SubnetId')

# NOTE(review): this takes the first image returned by the query; whether that
# is the most recent build depends on the API's ordering - confirm.
ami_json=$(aws ec2 describe-images --region "$region" --owner amazon \
  --filter "Name=name,Values=ubuntu/images/hvm-ssd/ubuntu*22.04*${ARCH}*")
ami=$(jq -r '.Images[0].ImageId' <<<"$ami_json")
require_value "AMI id" "$ami"

group_json=$(aws ec2 create-security-group \
  --group-name "benchmark-run-$run_id" \
  --description "created for benchmark run $run_id" \
  --vpc-id "$vpc_id" --region "$region")
group_id=$(jq -r '.GroupId' <<<"$group_json")
require_value "security group id" "$group_id"

# Open port 22 to the world so the benchmark host can be reached over SSH.
ingress_json=$(aws ec2 authorize-security-group-ingress \
  --ip-permissions '[ { "IpProtocol": "tcp", "FromPort": 22, "ToPort": 22, "IpRanges": [ { "CidrIp": "0.0.0.0/0" } ] } ]' \
  --group-id "$group_id" --region "$region")
jq . <<<"$ingress_json"

key_json=$(aws ec2 create-key-pair --key-name "$key_id" --region "$region")
key_pair_created=1
# Create the private key file with restrictive permissions from the start
# rather than tightening them only after the key material has been written.
(
  umask 077
  jq -r '.KeyMaterial' <<<"$key_json" > "${key_id}.pem"
)
chmod 600 "${key_id}.pem"

run_json=$(aws ec2 run-instances --image-id "$ami" --count 1 \
  --instance-type "$MACHINE_TYPE" --key-name "$key_id" \
  --security-group-ids "$group_id" --region "$region" \
  --associate-public-ip-address --cli-read-timeout 600 --ebs-optimized \
  --block-device-mapping '[ { "DeviceName": "/dev/sda1", "Ebs": { "VolumeSize": 120 } } ]')
instance_id=$(jq -r '.Instances[0].InstanceId' <<<"$run_json")
require_value "instance id" "$instance_id"
echo "instance ready: $instance_id"
#######################################
# Tears down the AWS resources created for this benchmark run: terminates the
# instance and waits for it to be gone, deletes the key pair together with its
# local .pem file, and deletes the security group.
# Globals:   instance_id, key_id, group_id, region (read)
#            cleanup_done (written; makes repeated invocations a no-op)
# Arguments: none
# Outputs:   progress messages on stdout, problems on stderr
# Returns:   0 normally; exits the shell with status 1 when the instance did
#            not reach a terminated state within the wait limit (after the
#            remaining resources have still been attempted).
#######################################
cleanup() {
  # The handler can be reached more than once (for example from an ERR or
  # signal trap and then again from the EXIT trap); only the first call works.
  if [ -n "${cleanup_done:-}" ]; then
    return 0
  fi
  cleanup_done=1

  # Continue cleanup even if individual commands fail, but remember whether
  # errexit was active so it can be switched back on afterwards. Leaving it
  # off would let the script carry on after the failure that triggered us.
  local restore_errexit=""
  case "$-" in
    *e*) restore_errexit=1 ;;
  esac
  set +e

  local cleanup_failed=0
  local wait_limit=300 poll_interval=5 waited=0
  local describe_json status terminated attempt group_deleted

  if [ -n "${instance_id:-}" ]; then
    echo "Terminating instance $instance_id"
    aws ec2 terminate-instances --instance-ids "$instance_id" --region "$region" | jq . || true

    # Poll until the instance is terminated. The wait is tracked with a plain
    # counter: bash's SECONDS already advances on its own, so adding to it by
    # hand would count every second twice.
    echo "Waiting for instance to terminate..."
    terminated=""
    while [ "$waited" -lt "$wait_limit" ]; do
      # Check the aws exit status directly; behind a pipe it would be replaced
      # by jq's status and a failed lookup would go unnoticed.
      if describe_json=$(aws ec2 describe-instances --instance-ids "$instance_id" --region "$region"); then
        status=$(printf '%s' "$describe_json" | jq -r '.Reservations[0].Instances[0].State.Name')
      else
        status="error"
      fi
      case "$status" in
        terminated)
          echo "Instance successfully terminated"
          terminated=1
          break
          ;;
        "" | null | error)
          echo "Instance not found - assuming terminated"
          terminated=1
          break
          ;;
      esac
      echo "Instance status: $status"
      sleep "$poll_interval"
      waited=$((waited + poll_interval))
    done
    if [ -z "$terminated" ]; then
      echo "Error: Timeout waiting for instance termination. Please check AWS instances for manual cleanup." >&2
      # Keep going so the key pair and security group are still attempted.
      cleanup_failed=1
    fi
  fi

  if [ -n "${key_id:-}" ]; then
    echo "Deleting key pair $key_id"
    aws ec2 delete-key-pair --key-name "$key_id" --region "$region" | jq . || true
    rm -f -- "${key_id}.pem" || true
  fi

  if [ -n "${group_id:-}" ]; then
    echo "Deleting security group $group_id"
    # Retry because deletion can fail while the instance is still terminating.
    # The aws command is tested directly so a failure actually causes a retry.
    group_deleted=""
    for attempt in 1 2 3 4 5 6; do
      if aws ec2 delete-security-group --group-id "$group_id" --region "$region"; then
        group_deleted=1
        break
      fi
      if [ "$attempt" -lt 6 ]; then
        echo "Retrying security group deletion in 10 seconds..."
        sleep 10
      fi
    done
    if [ -z "$group_deleted" ]; then
      echo "Warning: could not delete security group $group_id; please remove it manually." >&2
    fi
  fi

  if [ -n "$restore_errexit" ]; then
    set -e
  fi
  if [ "$cleanup_failed" -ne 0 ]; then
    exit 1
  fi
}
# Run cleanup on every exit path, once. Signals are turned into a normal exit
# so they reach the EXIT trap; a separate ERR trap is unnecessary because
# `set -e` already makes a failing command exit the script, and registering
# cleanup for ERR and the signals as well made it run several times.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Wait (up to 600 attempts, one second apart) until the instance reports a
# public DNS name and that host answers an SSH key scan.
dns_name=
for _ in {1..600}; do
  dns_name=$(aws ec2 describe-instances --filters "Name=instance-id,Values=$instance_id" --region "$region" | jq -r '.Reservations[0].Instances[0].PublicDnsName')
  if [[ -n "$dns_name" && "$dns_name" != "null" ]]; then
    if ssh-keyscan "$dns_name" > /dev/null; then
      break
    fi
  fi
  sleep 1
done
if [[ -z "$dns_name" || "$dns_name" == "null" ]]; then
  echo "did not receive dns name in time"
  exit 1
fi

ssh_addr="ubuntu@$dns_name"
# Identity option shared by every ssh/scp call below.
ssh_opts=(-i "${key_id}.pem")
echo "${key_id}.pem"
echo "$ssh_addr"
mkdir -p ~/.ssh
ssh-keyscan "$dns_name" >> ~/.ssh/known_hosts
echo "Added hosts"

# Wait for the instance to be fully booted up, with a timeout of 5 minutes.
# SECONDS is bash's self-incrementing timer, so it is only reset here and
# never added to by hand (that would count each second twice). Success is
# tracked with a flag so a login that succeeds right at the deadline is not
# misreported as a timeout.
ssh_ready=false
SECONDS=0
timeout=300
while ((SECONDS < timeout)); do
  # ConnectTimeout keeps a single unanswered connection attempt from
  # blocking far beyond the overall deadline.
  if ssh "${ssh_opts[@]}" -o ConnectTimeout=10 "$ssh_addr" -- 'echo "System is ready"'; then
    ssh_ready=true
    break
  fi
  sleep 1
done
if [[ "$ssh_ready" != true ]]; then
  echo "Timeout: VM is not SSH'able after 300 seconds"
  exit 1
fi

# Install Docker on the remote host and add the login user to the docker group.
scp "${ssh_opts[@]}" -r install_docker_ubuntu.sh "$ssh_addr:~"
ssh "${ssh_opts[@]}" "$ssh_addr" -- 'sh install_docker_ubuntu.sh'
# Single quotes: $USER is expanded on the remote side, not locally.
ssh "${ssh_opts[@]}" "$ssh_addr" -- 'sudo sudo groupadd docker; sudo usermod -aG docker $USER'

# Copy the benchmark apps and the runner script to the remote host.
ssh "${ssh_opts[@]}" "$ssh_addr" -- "mkdir -p ~/apps/"
scp "${ssh_opts[@]}" -r apps/ann-benchmarks "$ssh_addr:~/apps/"
scp "${ssh_opts[@]}" -r apps/weaviate-no-restart-on-crash/ "$ssh_addr:~/apps/"
scp "${ssh_opts[@]}" -r ann_benchmark_quantization.sh "$ssh_addr:~"

# The remote shell re-parses the command line, so every value is shell-quoted
# with %q; values containing spaces or metacharacters then arrive intact
# instead of being split or interpreted on the remote side.
remote_env=$(printf 'DATASET=%q DISTANCE=%q REQUIRED_RECALL=%q QUANTIZATION=%q WEAVIATE_VERSION=%q MACHINE_TYPE=%q CLOUD_PROVIDER=%q OS=%q' \
  "$dataset" "$distance" "${REQUIRED_RECALL:-}" "${QUANTIZATION:-}" \
  "${WEAVIATE_VERSION:-}" "$MACHINE_TYPE" "$CLOUD_PROVIDER" "$OS")
ssh "${ssh_opts[@]}" "$ssh_addr" -- "$remote_env bash ann_benchmark_quantization.sh"

# Fetch the result files; the glob is expanded on the remote side.
mkdir -p results
scp "${ssh_opts[@]}" -r "$ssh_addr:~/results/*.json" results/