diff --git a/.gitignore b/.gitignore
index 6fb1a186882290362e7fd372757924e8f7a2e8c3..4bf5d480aaf308d471ee7370ba7c9e7e00d3fbd9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,6 @@ *.Rproj
 
 # OS X stuff - https://gist.github.com/adamgit/3786883
 .DS_Store
-.Trashes
\ No newline at end of file
+.Trashes
+# sensitive files
+/howTo/r-with-aws/access_keys/aws_access.R
\ No newline at end of file
diff --git a/README.md b/README.md
index b51a35905430e8600ff44b0eebab52c70fdb9516..084bdd1fe57b4100668986aeed51ffcee20c583e 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ This repo does three things:
   * how to use R/RStudio on the University [SVE (remote desktop) service](howTo/sve.md)
   * where to [keep your data](howTo/keepingData.md)
   * how to use [renv](howTo/renv.md) to manage your R environment - including packages
+  * how to access Amazon Web Services S3 buckets directly from R using [aws.s3](howTo/aws-s3.md)
 * it is a [template](repoAsATemplate.md) repo that illustrates how we work and which you can copy;
 * it is an R package. This means:
   * package functions are kept in /R
diff --git a/howTo/aws-s3.md b/howTo/aws-s3.md
new file mode 100644
index 0000000000000000000000000000000000000000..be953ad436106191ba38ccfc6cd31f90d20f50d3
--- /dev/null
+++ b/howTo/aws-s3.md
@@ -0,0 +1,29 @@
+# Guide to accessing data from Amazon Web Services (AWS) S3 buckets using R
+
+This guide explains how to set up access to files/data stored in an AWS S3 bucket directly from an R session, using the [aws.s3 package](https://github.com/cloudyr/aws.s3).
+
+Prerequisite: you need access to the AWS account where the S3 bucket is located in order to create a user access policy.
+
+## Creating a user access policy (in the AWS console)
+
+This follows the guidance here: https://www.gormanalysis.com/blog/connecting-to-aws-s3-with-r/
+
+Create a user (e.g. 'rconnector') and attach a user access policy (e.g. 'test-bucket-connector') - see the [example access policy](r-with-aws/rconnector-access-policy).
+
+Make sure you save the access key ID and secret access key for use with the S3 API client.
+
+Use these details to set the environment variables shown below and store the credentials in an R script in your project folder - in this example, in a subfolder called [access_keys](r-with-aws/access_keys). For security, exclude this file from the project repository by adding it to your .gitignore file. The R script will look something like this:
+
+```
+Sys.setenv(
+  "AWS_ACCESS_KEY_ID" = "mykey",
+  "AWS_SECRET_ACCESS_KEY" = "mysecretkey",
+  "AWS_DEFAULT_REGION" = "eu-west-2"
+)
+```
+
+An example script can be found [here](r-with-aws/access_keys/example_credentials_script.R).
+
+## Connecting to the S3 bucket with R
+
+You're ready to go! See the [example code](r-with-aws/using_aws-s3_example.R) for commands that authenticate R with AWS and read and write files from/to AWS S3 buckets.
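+
+As a minimal sketch of a first session (the bucket name below is a placeholder - substitute your own - and the path assumes you are working from the project root):
+
+```
+# load your credentials into environment variables
+source("./howTo/r-with-aws/access_keys/aws_access.R") # adjust the path to wherever you saved your credentials script
+
+# list the buckets you can see
+aws.s3::bucketlist()
+
+# write a small test file locally, upload it, then read it back from the bucket
+write.csv(iris, "iris.csv")
+aws.s3::put_object(file = "iris.csv", object = "iris.csv", bucket = "my-test-bucket") # placeholder bucket name
+aws.s3::s3read_using(FUN = read.csv, bucket = "my-test-bucket", object = "iris.csv")
+```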
diff --git a/howTo/r-with-aws/access_keys/example_credentials_script.R b/howTo/r-with-aws/access_keys/example_credentials_script.R
new file mode 100644
index 0000000000000000000000000000000000000000..19a90c6ae5a9c8b61bce4ffe87abc7a7adcdf5df
--- /dev/null
+++ b/howTo/r-with-aws/access_keys/example_credentials_script.R
@@ -0,0 +1,8 @@
+# Set environment variables to authenticate access to an AWS S3 bucket
+# Use in conjunction with the aws.s3 package
+
+Sys.setenv(
+  "AWS_ACCESS_KEY_ID" = "mykey",
+  "AWS_SECRET_ACCESS_KEY" = "mysecretkey",
+  "AWS_DEFAULT_REGION" = "eu-west-2"
+)
\ No newline at end of file
diff --git a/howTo/r-with-aws/bucket-policy b/howTo/r-with-aws/bucket-policy
new file mode 100644
index 0000000000000000000000000000000000000000..1f9a74cfe80da8f47d1fbf70bed4ee83262fc5bf
--- /dev/null
+++ b/howTo/r-with-aws/bucket-policy
@@ -0,0 +1,33 @@
+{
+    "Version": "2012-10-17",
+    "Id": "PolicyForDestinationBucket",
+    "Statement": [
+        {
+            "Sid": "Permissions on objects and buckets",
+            "Effect": "Allow",
+            "Principal": {
+                "AWS": "arn:aws:iam::000000000000:role/cross-account-bucket-replication-role"
+            },
+            "Action": [
+                "s3:List*",
+                "s3:GetBucketVersioning",
+                "s3:PutBucketVersioning",
+                "s3:ReplicateDelete",
+                "s3:ReplicateObject"
+            ],
+            "Resource": [
+                "arn:aws:s3:::my-s3-bucket-name",
+                "arn:aws:s3:::my-s3-bucket-name/*"
+            ]
+        },
+        {
+            "Sid": "Permission to override bucket owner",
+            "Effect": "Allow",
+            "Principal": {
+                "AWS": "arn:aws:iam::999999999999:root"
+            },
+            "Action": "s3:ObjectOwnerOverrideToBucketOwner",
+            "Resource": "arn:aws:s3:::my-s3-bucket-name/*"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/howTo/r-with-aws/rconnector-access-policy b/howTo/r-with-aws/rconnector-access-policy
new file mode 100644
index 0000000000000000000000000000000000000000..c44a7059b9f9464710478b9ad9abeb6b04c5fba5
--- /dev/null
+++ b/howTo/r-with-aws/rconnector-access-policy
@@ -0,0 +1,118 @@
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "VisualEditor0",
+            "Effect": "Allow",
+            "Action": [
+                "s3:PutAnalyticsConfiguration",
+                "s3:GetObjectVersionTagging",
+                "s3:DeleteAccessPoint",
+                "s3:CreateBucket",
+                "s3:ReplicateObject",
+                "s3:GetObjectAcl",
+                "s3:GetBucketObjectLockConfiguration",
+                "s3:DeleteBucketWebsite",
+                "s3:GetIntelligentTieringConfiguration",
+                "s3:DeleteJobTagging",
+                "s3:PutLifecycleConfiguration",
+                "s3:GetObjectVersionAcl",
+                "s3:PutObjectTagging",
+                "s3:DeleteObject",
+                "s3:DeleteObjectTagging",
+                "s3:GetBucketPolicyStatus",
+                "s3:GetObjectRetention",
+                "s3:GetBucketWebsite",
+                "s3:GetJobTagging",
+                "s3:PutReplicationConfiguration",
+                "s3:GetObjectAttributes",
+                "s3:DeleteObjectVersionTagging",
+                "s3:PutObjectLegalHold",
+                "s3:InitiateReplication",
+                "s3:GetObjectLegalHold",
+                "s3:GetBucketNotification",
+                "s3:PutBucketCORS",
+                "s3:GetReplicationConfiguration",
+                "s3:ListMultipartUploadParts",
+                "s3:PutObject",
+                "s3:GetObject",
+                "s3:PutBucketNotification",
+                "s3:DescribeJob",
+                "s3:PutBucketLogging",
+                "s3:GetAnalyticsConfiguration",
+                "s3:PutBucketObjectLockConfiguration",
+                "s3:GetObjectVersionForReplication",
+                "s3:CreateAccessPoint",
+                "s3:GetLifecycleConfiguration",
+                "s3:GetInventoryConfiguration",
+                "s3:GetBucketTagging",
+                "s3:PutAccelerateConfiguration",
+                "s3:DeleteObjectVersion",
+                "s3:GetBucketLogging",
+                "s3:ListBucketVersions",
+                "s3:ReplicateTags",
+                "s3:RestoreObject",
+                "s3:ListBucket",
+                "s3:GetAccelerateConfiguration",
+                "s3:GetObjectVersionAttributes",
+                "s3:GetBucketPolicy",
+                "s3:PutEncryptionConfiguration",
+                "s3:GetEncryptionConfiguration",
+                "s3:GetObjectVersionTorrent",
+                "s3:AbortMultipartUpload",
+                "s3:PutBucketTagging",
+                "s3:GetBucketRequestPayment",
+                "s3:GetAccessPointPolicyStatus",
+                "s3:UpdateJobPriority",
+                "s3:GetObjectTagging",
+                "s3:GetMetricsConfiguration",
+                "s3:GetBucketOwnershipControls",
+                "s3:DeleteBucket",
+                "s3:PutBucketVersioning",
+                "s3:GetBucketPublicAccessBlock",
+                "s3:ListBucketMultipartUploads",
+                "s3:PutIntelligentTieringConfiguration",
+                "s3:PutMetricsConfiguration",
+                "s3:PutBucketOwnershipControls",
+                "s3:PutObjectVersionTagging",
+                "s3:PutJobTagging",
+                "s3:UpdateJobStatus",
+                "s3:GetBucketVersioning",
+                "s3:GetBucketAcl",
+                "s3:PutInventoryConfiguration",
+                "s3:GetObjectTorrent",
+                "s3:PutBucketWebsite",
+                "s3:PutBucketRequestPayment",
+                "s3:PutObjectRetention",
+                "s3:GetBucketCORS",
+                "s3:GetBucketLocation",
+                "s3:GetAccessPointPolicy",
+                "s3:ReplicateDelete",
+                "s3:GetObjectVersion"
+            ],
+            "Resource": [
+                "arn:aws:s3:::my-aws-bucket",
+                "arn:aws:s3:*:999999999999:accesspoint/*",
+                "arn:aws:s3:::my-aws-bucket/*",
+                "arn:aws:s3:*:999999999999:job/*"
+            ]
+        },
+        {
+            "Sid": "VisualEditor1",
+            "Effect": "Allow",
+            "Action": [
+                "s3:ListStorageLensConfigurations",
+                "s3:ListAccessPointsForObjectLambda",
+                "s3:GetAccessPoint",
+                "s3:GetAccountPublicAccessBlock",
+                "s3:ListAllMyBuckets",
+                "s3:ListAccessPoints",
+                "s3:ListJobs",
+                "s3:PutStorageLensConfiguration",
+                "s3:ListMultiRegionAccessPoints",
+                "s3:CreateJob"
+            ],
+            "Resource": "*"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/howTo/r-with-aws/using_aws-s3_example.R b/howTo/r-with-aws/using_aws-s3_example.R
new file mode 100644
index 0000000000000000000000000000000000000000..432c69993c5a81862ee33942a55f414a190f0f0d
--- /dev/null
+++ b/howTo/r-with-aws/using_aws-s3_example.R
@@ -0,0 +1,48 @@
+# Requires the aws.s3 package - install it if needed:
+# install.packages("aws.s3")
+
+# Set the environment variables holding your AWS access keys
+source("./howTo/r-with-aws/access_keys/aws_access.R") # replace with the path to your own credentials script, e.g. the example below
+# source("./howTo/r-with-aws/access_keys/example_credentials_script.R")
+
+# Get the list of buckets you can access
+aws.s3::bucketlist()
+
+# set the bucket name (less typing) - this is the name of your S3 bucket
+my_bucket <- "twr-test-bucket-r"
+
+# write a file to the temp dir - using a built-in data frame
+write.csv(iris, file.path(tempdir(), "iris.csv"))
+
+# save an object (the file from the temp dir) to the bucket
+aws.s3::put_object(
+  file = file.path(tempdir(), "iris.csv"),
+  object = "iris.csv",
+  bucket = my_bucket
+)
+
+# list the objects in the bucket
+aws.s3::get_bucket(
+  bucket = my_bucket
+)
+
+# provide a nice table of the objects in the bucket
+data.table::rbindlist(aws.s3::get_bucket(bucket = my_bucket))
+
+# read an object from the S3 bucket, three ways ...
+
+# 1. bucket and object specified separately
+aws.s3::s3read_using(
+  FUN = read.csv, bucket = my_bucket, object = "iris.csv"
+  )
+
+# 2. use the S3 URI
+aws.s3::s3read_using(
+  FUN = read.csv, object = "s3://twr-test-bucket-r/iris.csv"
+  )
+
+# 3. use data.table's fread() function for fast CSV reading
+aws.s3::s3read_using(
+  FUN = data.table::fread, object = "s3://twr-test-bucket-r/iris.csv"
+  )
+