Merge branch 'WorkingUpdate' into 'master'

Working update See merge request rjp2g12/LDMAP!1

Merge branch 'WorkingUpdate' into 'master'
949ab8a9 · Reuben Pengelly · 0ff3bfa2 · f495d738 · 949ab8a9 · 0ff3bfa2
Commit 949ab8a9 authored 4 years ago by Reuben Pengelly
--- a/ACCESSORY_SCRIPTS/contig_stitch.pl
+++ b/ACCESSORY_SCRIPTS/contig_stitch.pl
+#!perl;
+use strict;
+use warnings;
+
+## Takes file list of split LDMAP outputs, merges contigs together, averaging along the overlapping segment. Must use CSV input.
+## Calling input "perl contig_stitch.pl {list file} {overlap} {trim}"
+
+
+
+## Read input and validate, prepare files.
+my $overlap_max = $ARGV[1]; my $trim = $ARGV[2];
+if (((defined $overlap_max) and ($overlap_max >= 0)) and ((defined $trim) and ($trim > 0))) {} else {die "\nEnsure all required variables are supplied on calling.\nSource file, overlap and desired trim distance must be supplied, in that order.\n\n";}
+if (($trim * 2) > $overlap_max) {
+	die "\nTrim length is greater than the length of overlap_max, would create holes in map!\nTrim is removed from both contig ends on joining.\nCheck input variables.\n\n";
+}
+open LIST, $ARGV[0] or die "\nUnable to open file containing list.\n";
+chomp (my @file_list = <LIST>);
+close LIST;
+	print "\nRunning on: @file_list\n\n";
+print "Settings are: overlap max = $overlap_max\; trim from each touching end = $trim.\n\n";
+
+open LOG, '>concat_' . "$ARGV[0]" . '.log';
+print LOG "\nRunning on: @file_list\n";
+print LOG "Settings are: overlap max = $overlap_max\; trim from each touching end = $trim.\n\n";
+
+open OUT, '>concat_' . "$ARGV[0]" . '.ter.csv';
+close OUT;
+
+
+## Loop through the file list and concatenate.
+my $file1_name; my $file2_name; my $iter = 0;
+while (@file_list) {
+	## Declare files to be opened for merging
+	if ($iter == 0) {
+		$file1_name = (shift @file_list);
+	}
+	$file2_name = (shift @file_list);
+	
+	## Open first 2 files and discard headers
+	open FILE1, $file1_name or die "\n cannot open $file1_name.";
+	chomp (my @file1 = <FILE1>);
+	close FILE1;
+	if ($file1[0] !~ /,/) {die "Input data must be in CSV format\n";}
+	if ($file1[0] =~ /^N/) {shift @file1;}
+	
+	open FILE2, $file2_name or die "\n cannot open $file2_name.";
+	chomp (my @file2 = <FILE2>);
+	close FILE2;
+	shift @file2;
+	if ($file2[0] =~ /^N/) {shift @file2;}
+
+	## Count number of shared entries in both arrays to calculate the overlap.
+	## Hash based will be quicker, may need to implement in the future.
+	my $cnt = 0;
+	
+	## Hash try
+	my %file2_temp;
+	foreach (@file2) {
+		my @split = split /,/, $_;
+		$file2_temp{$split[1]} = $_;
+	}
+	foreach my $line_1 (@file1) {
+		my @split_line_1 = split /,/, $line_1;
+		if ( exists $file2_temp{$split_line_1[1]}  ) {
+			$cnt ++;
+		}
+	}
+	undef %file2_temp; 
+		
+	my $overlap = $cnt;
+	print LOG "$file1_name\t$file2_name\t$overlap entry overlap\n";
+	if ($overlap > $overlap_max) {die "Overlap has exceeded defined max of $overlap_max between $file1_name & $file2_name.\n\n";};
+
+	## Define LDU prior to trim point in file 2
+	my @temp = split /,/, $file2[($trim-1)]; 
+	my $file2_start_tide = $temp[3];
+	undef @temp;
+	
+	## Trim desired length off arrays
+	if ($trim > 0) {
+		@file1 = reverse @file1;
+		splice @file1, 0, $trim;
+		@file1 = reverse @file1;
+		splice @file2, 0, $trim; ## Removes from start, counting forwards.
+	}
+	
+	## Count remaining overlapping entries after trim
+	$cnt = 0;
+	foreach (@file2) {
+		my @split = split /,/, $_;
+		$file2_temp{$split[1]} = $_;
+	}
+	foreach my $line_1 (@file1) {
+		my @split_line_1 = split /,/, $line_1;
+		if ( exists $file2_temp{$split_line_1[1]}  ) {
+			$cnt ++;
+		}
+	}
+	undef %file2_temp;
+	$overlap = $cnt;
+	
+	@file1 = reverse @file1;
+	my @over1 = splice @file1, 0, $overlap;
+	@over1 = reverse @over1;
+	@file1 = reverse @file1;
+	my @over2 = splice @file2, 0, $overlap; ## Removes from start, counting forwards.
+	
+	## Define value for end of overlap for subtraction later to narmalise with the meaned value
+	@temp = split /,/, $over2[-1];
+	my $over2_end_tide = $temp[3];
+	undef @temp;
+	
+	## Define LDUvalue for starting meaned contig
+	@temp = split /,/, $file1[-1];
+	my $tide1 = $temp[3];
+	undef @temp;
+	
+	
+	## Average values within overlap
+	## Unpaired values will be added to array too, without alteration. Warning will be printed.
+	my @meaned;
+	foreach my $line1 (@over1) {
+		my @loc1 = split /,/, $line1;
+		my $index = 0; my $found = 0;
+		foreach my $line2 (@over2) {
+			my @loc2 = split /,/, $line2;
+			if ($loc1[1] eq $loc2[1]) {
+				$found++;
+				$loc1[3] = (($loc1[3] + ($loc2[3] + $tide1 - $file2_start_tide)) / 2);
+				$line1 = join ',', @loc1,'aved';
+				push @meaned, $line1;
+				splice @over2, $index, 1;
+				last;
+			}
+			$index++;
+		}
+		if ($found == 0) { 
+			print STDERR "No pair found for $line1\n";
+			push @meaned, $line1;
+		}
+	}
+	foreach my $line2 (@over2) {
+		#print STDERR "No pair found for $line2\n";
+		my @loc2 = split /,/, $line2;
+		$loc2[3] = ($loc2[3] + $tide1 - $file2_start_tide);
+		$line2 = join ',', @loc2;
+		push @meaned, $line2;
+	}
+	@temp = split /,/, $meaned[-1];
+	my $tide2 = $temp[3];
+	undef @temp;
+	
+	## Print values to final file
+	open OUT, '>concat_' . "$ARGV[0]" . '.ter.csv';
+	print OUT "N,Locus,KBmap,LDUmap,aved?\n";
+	
+		foreach (@file1) {
+			print OUT "$_\n";
+		}
+	foreach (@meaned) {
+		print OUT "$_\n";
+	}
+	
+	
+	foreach (@file2) {
+		my @loc = split /,/, $_;
+		$loc[3] += ($tide2 - $over2_end_tide);
+		my $str = join ',', @loc;
+		print OUT "$str\n";
+	}
+	close OUT;
+	
+	## Define variables for next loop
+	$file1_name = 'concat_' . "$ARGV[0]" . '.ter.csv';	
+	$iter ++;
+}
+
+## Rejoice in final success!
+print "File merging complete!\n";
+
+__END__
+Takes file list of split LDMAP outputs, merges contigs together, averaging along the overlapping segment. Must use CSV input.
+Input file list must be in correct order, errors in this should be fairly easy to spot on a discontinuous line on the resultant plot
+Input must have File, overlap of existing contigs, and desired trim distance.
+Trimming can be used as the quality of mapping will be greatly reduced at the end of segments, should also work to assemble the non-overlapping 'contig-free' approach, see WLau thesis.
+Trim is removed from overlap region!! This does require some pre-emptive mental arithmetic.
+
+Awkward to merge the two contigs as the overlap region that was defined for the VCF splitting is not necessarily the same as that which remains in the LDMAP output.
+LDMAP software does further QC check on loci, which can reduce the number of loci from those entered.
--- a/KEEPfileinputnew
+++ b/KEEPfileinputnew
--- a/README.md
+++ b/README.md
 # LDMAP

-Contact: [R.J.Pengelly@soton.ac.uk](mailto:r.j.pengelly@soton.ac.uk) or [A.R.Collins@soton.ac.uk](mailto:a.r.collins@soton.ac.uk)
+Contact: [R.J.Pengelly@soton.ac.uk](mailto:r.j.pengelly@soton.ac.uk) or [A.R.Collins@soton.ac.uk](mailto:a.r.collins@soton.ac.uk).

 Overview
 ========

--- a/allass.h
+++ b/allass.h
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include <time.h>
-#define MAX_LOCI 100000
+#define MAX_LOCI 1000000
 #define MAX_LINE 50000 
 #define IA 16807
 #define IM 2147483647
@@ -159,7 +158,7 @@ double g_df,g_V,g_ekeep,g_mkeep;
 /********************/
 char g_temp[MAX_LINE];
 char gg_temp[500];
-char terfile[20],intefile[20],datfile[20];
+char terfile[50],intefile[50],datfile[50];
 char g_loci[MAX_LOCI][20];
 char g_allloci[MAX_LOCI][20];


--- a/ldmapper.c
+++ b/ldmapper.c
@@ -2,12 +2,12 @@
 #include "allass.h"
 int main(int argc, char *argv[])
 {
-char outputfile[20],jobfile[20], temp1[20],temp2[20];
+char outputfile[50],jobfile[20], temp1[20],temp2[20];
 /*This is the maximum number of intervals used:*/
 g_int=100;
 /*This is the maximum distance between pairs of SNPs used:*/
 g_max=500.;
-/*This is the Hardy-Wienberg cut off */
+/*This is the Hardy-Weinberg cut off */
 g_hwp=0.001;
 /*This is the MAF cut off */
 g_maf=0.05;
@@ -15,6 +15,12 @@ g_maf=0.05;
 g_finish=0;
 g_calls=0;

+if(argc != 8)
+	{
+	fprintf(stderr, "\nIncorrect number of options given \n\nExpected usage: ./ldmapper1 [input.tped] [intermediate.txt] [job] [out.ldmap] [out.log] [MAF] [HWE]\n\n");
+	return 1;
+	}
+
 strcpy(datfile,argv[1]);strcpy(intefile,argv[2]); strcpy(jobfile,argv[3]);strcpy(terfile,argv[4]);strcpy(outputfile,argv[5]);
 strcpy(temp1,argv[6]);strcpy(temp2,argv[7]);


--- a/makefile
+++ b/makefile
@@ -6,7 +6,7 @@ SOURCES= \
 PROGRAM= ldmapper1 

 # gcc can take "-Wall -Werror" for checking
-CFLAGS= -g -O -Wall -Werror
+CFLAGS= -g -O
 CPPFLAGS= $(INCLUDEDIRS)
 LDFLAGS=
 LINTFLAGS=