diff --git a/R/create_LDprofile.R b/R/create_LDprofile.R index 7fbc4f3c2fd2c0b1061d96fe29bb31850471c5c4..73c1bf15902bdbdd8e8f8d9dd6409ce1bfe20a04 100644 --- a/R/create_LDprofile.R +++ b/R/create_LDprofile.R @@ -1,14 +1,17 @@ #' Creates an LD profile #' -#' An LD (linkage disequilibrium) profile is a look-up table that tell you the expected correlation between SNPs given the genetic distance between them. +#' An LD (linkage disequilibrium) profile is a look-up table containing the expected correlation between SNPs given the genetic distance between them. +#' +#' The input for \code{dist} and \code{x} can be lists. This is to allow multiple datasets to be used in the creation of the LD profile. For example, using all 22 autosomes from the human genome would involve 22 different distance vectors and SNP matrices. +#' Both lists should be the same length and should correspond exactly to eachother (i.e. the distances in each element of dist should go with the SNPs in the same element of x) #' #' In the output, bins represent lower bounds. The first bin contains pairs where the genetic distance is greater than or equal to 0 and less than \code{bin_size}. The final bin contains pairs where the genetic distance is greater than or equal to \code{max_dist}-\code{bin_size} and less than \code{max_dist}. #' If the \code{max_dist} is not an increment of \code{bin_size}, it will be adjusted to the next highest increment.The maximum bin will be the bin that \code{max_dist} falls into. For example, if the \code{max_dist} is given as 4.5 and the \code{bin_size} is 1, the final bin will be 4.\cr #' By default, Beta parameters are not calculated. To calcualte Beta parameters, needed for the \code{\link{Zalpha_BetaCDF}} and \code{\link{Zbeta_BetaCDF}} statistics, \code{beta_params} should be set to TRUE and the package \code{fitdistrplus} must be installed. #' -#' @param dist A numeric vector containing genetic distances. -#' @param x A matrix of SNP values. Columns represent chromosomes; rows are SNP locations. Hence, the number of rows should equal the length of the \code{dist} vector. SNPs should all be biallelic. +#' @param dist A numeric vector, or a list of numeric vectors, containing genetic distances. +#' @param x A matrix of SNP values, or a list of matrices. Columns represent chromosomes; rows are SNP locations. Hence, the number of rows should equal the length of the \code{dist} vector. SNPs should all be biallelic. #' @param bin_size The size of each bin, in the same units as \code{dist}. #' @param max_dist Optional. The maximum genetic distance to be considered. If this is not supplied, it will default to the maximum distance in the \code{dist} vector. #' @param beta_params Optional. Beta parameters are calculated if this is set to TRUE. Default is FALSE. @@ -30,27 +33,47 @@ #' @seealso \code{\link{Zalpha_expected}} \code{\link{Zalpha_rsq_over_expected}} \code{\link{Zalpha_log_rsq_over_expected}} \code{\link{Zalpha_Zscore}} \code{\link{Zalpha_BetaCDF}} \code{\link{Zbeta_expected}} \code{\link{Zbeta_rsq_over_expected}} \code{\link{Zbeta_log_rsq_over_expected}} \code{\link{Zbeta_Zscore}} \code{\link{Zbeta_BetaCDF}} \code{\link{Zalpha_all}} #' create_LDprofile<-function(dist,x,bin_size,max_dist=NULL,beta_params=FALSE){ - #Checks - #Check dist is vector - if (is.numeric(dist) ==FALSE || is.vector(dist)==FALSE){ - stop("dist must be a numeric vector") - } - #Check x is a matrix - if (is.matrix(x)==FALSE){ - stop("x must be a matrix") + + #Changes dist into a list if it is not one already + if (is.list(dist)==FALSE){ + dist<-list(dist) } - #Check x has rows equal to the length of dist - if (length(dist) != nrow(x)){ - stop("The number of rows in x must equal the number of SNP genetic distances given in dist") + + #Changes x into a list if it is not one already + if (is.list(x)==FALSE){ + x<-list(x) } - #Check SNPs are all biallelic - if (sum(apply(x,1,function(x){length(na.omit(unique(x)))}) != 2)>0){ - stop("SNPs must all be biallelic") + + #Checks + #Check the dist list and the x list have the same length + if (length(dist)!=length(x)){ + stop("dist and x should contain the same number of elements") } - #Change matrix x to numeric if it isn't already - if (is.numeric(x)==FALSE){ - x<-matrix(as.numeric(factor(x)),nrow=dim(x)[1]) + + #Check for each element in dist and x + for (el in 1:length(dist)){ + #Check dist is vector + if (is.numeric(dist[[el]]) ==FALSE || is.vector(dist[[el]])==FALSE){ + stop("dist must be a numeric vector or list of numeric vectors") + } + #Check x is a matrix + if (is.matrix(x[[el]])==FALSE){ + stop("x must be a matrix or list of matrices") + } + #Check x has rows equal to the length of dist + if (length(dist[[el]]) != nrow(x[[el]])){ + stop("The number of rows in x must equal the number of SNP genetic distances given in the corresponding dist") + } + #Check SNPs are all biallelic + if (sum(apply(x[[el]],1,function(x){length(na.omit(unique(x)))}) != 2)>0){ + stop("SNPs must all be biallelic") + } + #Change matrix x to numeric if it isn't already + if (is.numeric(x[[el]])==FALSE){ + x[[el]]<-matrix(as.numeric(factor(x[[el]])),nrow=dim(x[[el]])[1]) + } } + #Check bin_size is a number if (is.numeric(bin_size) ==FALSE || bin_size <= 0){ stop("bin_size must be a number greater than 0") @@ -62,7 +85,7 @@ create_LDprofile<-function(dist,x,bin_size,max_dist=NULL,beta_params=FALSE){ } } else { #Set max_dist to the maximum distance in the data if it was not supplied - max_dist<-dist[length(dist)]-dist[1] + max_dist<-max(sapply(dist,function(x){x[length(x)]-x[1]}),na.rm = TRUE) } #Adjusts the Max_dist value so it is equal to an increment of bin_size if it isn't already if(!isTRUE(all.equal(max_dist,assign_bins(bin_size,max_dist)))){ @@ -79,12 +102,16 @@ create_LDprofile<-function(dist,x,bin_size,max_dist=NULL,beta_params=FALSE){ } } - #Find the differences in genetic distances between pairs of SNPs - diffs<-lower_triangle(outer(dist,dist,"-")) - - #Find the rsquared value between pairs of SNPs - rsq<-lower_triangle(cor(t(x),use="pairwise.complete.obs")^2) + diffs<-NULL + rsq<-NULL + #for each element in dist and x, get the differences and rsquared values + for (el in 1:length(dist)){ + #Find the differences in genetic distances between pairs of SNPs + diffs<-c(diffs,lower_triangle(outer(dist[[el]],dist[[el]],"-"))) + #Find the rsquared value between pairs of SNPs + rsq<-c(rsq,lower_triangle(cor(t(x[[el]]),use="pairwise.complete.obs")^2)) + } #Filter for just those less than the max genetic distance and filter out missing distances rsq<-rsq[diffs<max_dist & is.na(diffs)==FALSE] diffs<-diffs[diffs<max_dist & is.na(diffs)==FALSE] diff --git a/man/create_LDprofile.Rd b/man/create_LDprofile.Rd index 30a62b8904dfe9207a2f717adf48b338d6f56077..c94925ebdfc390836c244453764deb9675417206 100644 --- a/man/create_LDprofile.Rd +++ b/man/create_LDprofile.Rd @@ -7,9 +7,9 @@ create_LDprofile(dist, x, bin_size, max_dist = NULL, beta_params = FALSE) } \arguments{ -\item{dist}{A numeric vector containing genetic distances.} +\item{dist}{A numeric vector, or a list of numeric vectors, containing genetic distances.} -\item{x}{A matrix of SNP values. Columns represent chromosomes; rows are SNP locations. Hence, the number of rows should equal the length of the \code{dist} vector. SNPs should all be biallelic.} +\item{x}{A matrix of SNP values, or a list of matrices. Columns represent chromosomes; rows are SNP locations. Hence, the number of rows should equal the length of the \code{dist} vector. SNPs should all be biallelic.} \item{bin_size}{The size of each bin, in the same units as \code{dist}.} @@ -21,9 +21,12 @@ create_LDprofile(dist, x, bin_size, max_dist = NULL, beta_params = FALSE) A data frame containing an LD profile that can be used by other statistics in this package. } \description{ -An LD (linkage disequilibrium) profile is a look-up table that tell you the expected correlation between SNPs given the genetic distance between them. +An LD (linkage disequilibrium) profile is a look-up table containing the expected correlation between SNPs given the genetic distance between them. } \details{ +The input for \code{dist} and \code{x} can be lists. This is to allow multiple datasets to be used in the creation of the LD profile. For example, using all 22 autosomes from the human genome would involve 22 different distance vectors and SNP matrices. +Both lists should be the same length and should correspond exactly to eachother (i.e. the distances in each element of dist should go with the SNPs in the same element of x) + In the output, bins represent lower bounds. The first bin contains pairs where the genetic distance is greater than or equal to 0 and less than \code{bin_size}. The final bin contains pairs where the genetic distance is greater than or equal to \code{max_dist}-\code{bin_size} and less than \code{max_dist}. If the \code{max_dist} is not an increment of \code{bin_size}, it will be adjusted to the next highest increment.The maximum bin will be the bin that \code{max_dist} falls into. For example, if the \code{max_dist} is given as 4.5 and the \code{bin_size} is 1, the final bin will be 4.\cr By default, Beta parameters are not calculated. To calcualte Beta parameters, needed for the \code{\link{Zalpha_BetaCDF}} and \code{\link{Zbeta_BetaCDF}} statistics, \code{beta_params} should be set to TRUE and the package \code{fitdistrplus} must be installed. diff --git a/tests/testthat/test-create_LDprofile.R b/tests/testthat/test-create_LDprofile.R index 416b5174ec831e2604980fa963d4c946ac232faf..7bb00ad9160598e9731d7a1204bb68f5f06b6cc2 100644 --- a/tests/testthat/test-create_LDprofile.R +++ b/tests/testthat/test-create_LDprofile.R @@ -109,15 +109,15 @@ test_that("create_LDprofile calculates the LD profile correctly with character m test_that("create_LDprofile fails when dist is non-numeric", { expect_error(create_LDprofile(dist = paste0(df$dist,"dist"), x = as.matrix(df[,3:7]), bin_size = 0.001, max_dist = 0.005, beta_params = TRUE), - "dist must be a numeric vector") + "dist must be a numeric vector or list of numeric vectors") }) -## Test the function with dists not a matrix +## Test the function with dists not a vector -test_that("create_LDprofile fails when dist is not a matrix", { +test_that("create_LDprofile fails when dist is not a vector", { - expect_error(create_LDprofile(dist = df, x = as.matrix(df[,3:7]), bin_size = 0.001, max_dist = 0.005, beta_params = TRUE), - "dist must be a numeric vector") + expect_error(create_LDprofile(dist = as.matrix(df$dist), x = as.matrix(df[,3:7]), bin_size = 0.001, max_dist = 0.005, beta_params = TRUE), + "dist must be a numeric vector or list of numeric vectors") }) @@ -125,8 +125,8 @@ test_that("create_LDprofile fails when dist is not a matrix", { test_that("create_LDprofile fails when x is not a matrix", { - expect_error(create_LDprofile(dist = df$dist, x = df[,3:7], bin_size = 0.001, max_dist = 0.005, beta_params = TRUE), - "x must be a matrix") + expect_error(create_LDprofile(dist = df$dist, x = df$C1, bin_size = 0.001, max_dist = 0.005, beta_params = TRUE), + "x must be a matrix or list of matrices") }) ## Test the function with x not having the correct amount of rows @@ -134,7 +134,7 @@ test_that("create_LDprofile fails when x is not a matrix", { test_that("create_LDprofile fails when the number of rows in x is not equal to the length of pos", { expect_error(create_LDprofile(dist = df$dist, x = t(as.matrix(df[,3:7])), bin_size = 0.001, max_dist = 0.005, beta_params = TRUE), - "The number of rows in x must equal the number of SNP genetic distances given in dist") + "The number of rows in x must equal the number of SNP genetic distances given in the corresponding dist") }) ## Test the function with a SNP having only one allele