From 92c435f3c6b7c9261bc076ff8769fdc0ea3d2a33 Mon Sep 17 00:00:00 2001 From: Tom Rushby <t.w.rushby@soton.ac.uk> Date: Sun, 21 Aug 2022 16:20:44 +0100 Subject: [PATCH] Add how-to for accessing AWS S3 bucket from within R - initial commit. --- howTo/aws-s3.md | 29 +++++ .../access_keys/example_credentials_script.R | 8 ++ howTo/r-with-aws/bucket-policy | 33 +++++ howTo/r-with-aws/rconnector-access-policy | 118 ++++++++++++++++++ howTo/r-with-aws/using_aws-s3_example.R | 48 +++++++ 5 files changed, 236 insertions(+) create mode 100644 howTo/aws-s3.md create mode 100644 howTo/r-with-aws/access_keys/example_credentials_script.R create mode 100644 howTo/r-with-aws/bucket-policy create mode 100644 howTo/r-with-aws/rconnector-access-policy create mode 100644 howTo/r-with-aws/using_aws-s3_example.R diff --git a/howTo/aws-s3.md b/howTo/aws-s3.md new file mode 100644 index 0000000..be953ad --- /dev/null +++ b/howTo/aws-s3.md @@ -0,0 +1,29 @@ +# Guide to accessing data from Amazon Web Services (AWS) S3 buckets using R + +This guide provides details of how to set up and access files/data stored within an AWS S3 bucket directly from an R session using the [aws.s3 package](https://github.com/cloudyr/aws.s3). + +Prerequisite: access to the AWS account where the S3 bucket is located in order to create a user access policy. + +## Creating a user access policy (in the AWS console) + +Following guidance here: https://www.gormanalysis.com/blog/connecting-to-aws-s3-with-r/ + +Create user 'rconnector' ... and create user policy 'test-bucket-connector' (see [example access policy](howTo/r-with-aws/rconnector-access-policy)). + +Make sure to save the access key ID and secret access key to use with the S3 API client. + +Use these details to set the following environment variables (see below for code) and store the credentials in an R script e.g. in your project folder (in this example in a subfolder called [access keys](howTo/r-with-aws/access_keys). 
Note: for security, exclude this file from the project repository by adding it to your .gitignore file). The R script will look something like the following ... + +``` +Sys.setenv( + "AWS_ACCESS_KEY_ID" = "mykey", + "AWS_SECRET_ACCESS_KEY" = "mysecretkey", + "AWS_DEFAULT_REGION" = "eu-west-2" +) +``` + +An example script can be found [here](howTo/r-with-aws/access_keys/example_credentials_script.R). + +## Connecting to the S3 bucket with R + +You're ready to go! See [example code](howTo/r-with-aws/using_aws-s3_example.R) showing some commands to authenticate R with AWS and read and write files from/to AWS S3 buckets. diff --git a/howTo/r-with-aws/access_keys/example_credentials_script.R b/howTo/r-with-aws/access_keys/example_credentials_script.R new file mode 100644 index 0000000..19a90c6 --- /dev/null +++ b/howTo/r-with-aws/access_keys/example_credentials_script.R @@ -0,0 +1,8 @@ +# Set environment variables to authenticate access to AWS S3 bucket +# Use in conjunction with aws.s3 package + +Sys.setenv( + "AWS_ACCESS_KEY_ID" = "mykey", + "AWS_SECRET_ACCESS_KEY" = "mysecretkey", + "AWS_DEFAULT_REGION" = "eu-west-2" +) \ No newline at end of file diff --git a/howTo/r-with-aws/bucket-policy b/howTo/r-with-aws/bucket-policy new file mode 100644 index 0000000..1f9a74c --- /dev/null +++ b/howTo/r-with-aws/bucket-policy @@ -0,0 +1,33 @@ +{ + "Version": "2012-10-17", + "Id": "PolicyForDestinationBucket", + "Statement": [ + { + "Sid": "Permissions on objects and buckets", + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::000000000000:role/cross-account-bucket-replication-role" + }, + "Action": [ + "s3:List*", + "s3:GetBucketVersioning", + "s3:PutBucketVersioning", + "s3:ReplicateDelete", + "s3:ReplicateObject" + ], + "Resource": [ + "arn:aws:s3:::my-s3-bucket-name", + "arn:aws:s3:::my-s3-bucket-name/*" + ] + }, + { + "Sid": "Permission to override bucket owner", + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::999999999999:root" + }, + "Action": 
"s3:ObjectOwnerOverrideToBucketOwner", + "Resource": "arn:aws:s3:::my-s3-bucket-name/*" + } + ] +} \ No newline at end of file diff --git a/howTo/r-with-aws/rconnector-access-policy b/howTo/r-with-aws/rconnector-access-policy new file mode 100644 index 0000000..c44a705 --- /dev/null +++ b/howTo/r-with-aws/rconnector-access-policy @@ -0,0 +1,118 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "VisualEditor0", + "Effect": "Allow", + "Action": [ + "s3:PutAnalyticsConfiguration", + "s3:GetObjectVersionTagging", + "s3:DeleteAccessPoint", + "s3:CreateBucket", + "s3:ReplicateObject", + "s3:GetObjectAcl", + "s3:GetBucketObjectLockConfiguration", + "s3:DeleteBucketWebsite", + "s3:GetIntelligentTieringConfiguration", + "s3:DeleteJobTagging", + "s3:PutLifecycleConfiguration", + "s3:GetObjectVersionAcl", + "s3:PutObjectTagging", + "s3:DeleteObject", + "s3:DeleteObjectTagging", + "s3:GetBucketPolicyStatus", + "s3:GetObjectRetention", + "s3:GetBucketWebsite", + "s3:GetJobTagging", + "s3:PutReplicationConfiguration", + "s3:GetObjectAttributes", + "s3:DeleteObjectVersionTagging", + "s3:PutObjectLegalHold", + "s3:InitiateReplication", + "s3:GetObjectLegalHold", + "s3:GetBucketNotification", + "s3:PutBucketCORS", + "s3:GetReplicationConfiguration", + "s3:ListMultipartUploadParts", + "s3:PutObject", + "s3:GetObject", + "s3:PutBucketNotification", + "s3:DescribeJob", + "s3:PutBucketLogging", + "s3:GetAnalyticsConfiguration", + "s3:PutBucketObjectLockConfiguration", + "s3:GetObjectVersionForReplication", + "s3:CreateAccessPoint", + "s3:GetLifecycleConfiguration", + "s3:GetInventoryConfiguration", + "s3:GetBucketTagging", + "s3:PutAccelerateConfiguration", + "s3:DeleteObjectVersion", + "s3:GetBucketLogging", + "s3:ListBucketVersions", + "s3:ReplicateTags", + "s3:RestoreObject", + "s3:ListBucket", + "s3:GetAccelerateConfiguration", + "s3:GetObjectVersionAttributes", + "s3:GetBucketPolicy", + "s3:PutEncryptionConfiguration", + "s3:GetEncryptionConfiguration", + 
"s3:GetObjectVersionTorrent", + "s3:AbortMultipartUpload", + "s3:PutBucketTagging", + "s3:GetBucketRequestPayment", + "s3:GetAccessPointPolicyStatus", + "s3:UpdateJobPriority", + "s3:GetObjectTagging", + "s3:GetMetricsConfiguration", + "s3:GetBucketOwnershipControls", + "s3:DeleteBucket", + "s3:PutBucketVersioning", + "s3:GetBucketPublicAccessBlock", + "s3:ListBucketMultipartUploads", + "s3:PutIntelligentTieringConfiguration", + "s3:PutMetricsConfiguration", + "s3:PutBucketOwnershipControls", + "s3:PutObjectVersionTagging", + "s3:PutJobTagging", + "s3:UpdateJobStatus", + "s3:GetBucketVersioning", + "s3:GetBucketAcl", + "s3:PutInventoryConfiguration", + "s3:GetObjectTorrent", + "s3:PutBucketWebsite", + "s3:PutBucketRequestPayment", + "s3:PutObjectRetention", + "s3:GetBucketCORS", + "s3:GetBucketLocation", + "s3:GetAccessPointPolicy", + "s3:ReplicateDelete", + "s3:GetObjectVersion" + ], + "Resource": [ + "arn:aws:s3:::my-aws-bucket", + "arn:aws:s3:*:999999999999:accesspoint/*", + "arn:aws:s3:::my-aws-bucket/*", + "arn:aws:s3:*:999999999999:job/*" + ] + }, + { + "Sid": "VisualEditor1", + "Effect": "Allow", + "Action": [ + "s3:ListStorageLensConfigurations", + "s3:ListAccessPointsForObjectLambda", + "s3:GetAccessPoint", + "s3:GetAccountPublicAccessBlock", + "s3:ListAllMyBuckets", + "s3:ListAccessPoints", + "s3:ListJobs", + "s3:PutStorageLensConfiguration", + "s3:ListMultiRegionAccessPoints", + "s3:CreateJob" + ], + "Resource": "*" + } + ] +} \ No newline at end of file diff --git a/howTo/r-with-aws/using_aws-s3_example.R b/howTo/r-with-aws/using_aws-s3_example.R new file mode 100644 index 0000000..432c699 --- /dev/null +++ b/howTo/r-with-aws/using_aws-s3_example.R @@ -0,0 +1,48 @@ +# Requires aws.s3 package install if required +# install.packages("aws.s3") + +# Set environment variables to use AWS access keys +source("./howTo/r-with-aws/access_keys/aws_access.R") # Replace with your credentials e.g. 
next line +# source("./howTo/r-with-aws/access_keys/example_credentials_script.R") + +# Get list of buckets +aws.s3::bucketlist() + +# set bucket name (less typing) - this is the name of your s3 bucket +my_bucket <- "twr-test-bucket-r" + +# write a file to temp dir - using a built in data frame +write.csv(iris, file.path(tempdir(), "iris.csv")) + +# save an object (file from the temp dir) to the bucket +aws.s3::put_object( + file = file.path(tempdir(), "iris.csv"), + object = "iris.csv", + bucket = my_bucket +) + +# list objects in the bucket +aws.s3::get_bucket( + bucket = my_bucket +) + +# provide a nice table of objects in the bucket +data.table::rbindlist(aws.s3::get_bucket(bucket = my_bucket)) + +# read an object from s3 bucket, three ways ... + +# 1. bucket and object specified separately +aws.s3::s3read_using( + FUN = read.csv, bucket = my_bucket, object = "iris.csv" + ) + +# 2. use the s3 URI +aws.s3::s3read_using( + FUN = read.csv, object = "s3://twr-test-bucket-r/iris.csv" + ) + +# 3. use data.table's fread() function for fast CSV reading +aws.s3::s3read_using( + FUN = data.table::fread, object = "s3://twr-test-bucket-r/iris.csv" + ) + -- GitLab