biology/vcf-split: Split a multi-sample VCF into single-sample VCFs

Vcf-split splits a multi-sample VCF into single-sample VCFs, writing thousands of output files simultaneously. Parsing the TOPMed human chromosome 1 BCF with bcftools takes two days, so extracting the 137,977 samples one at a time or using thousands of parallel readers of the same file is impractical. Vcf-split solves this by generating thousands of single-sample outputs during a single sweep through the multi-sample input.
author: Jason W. Bacon <jwb@FreeBSD.org> 2021-03-21 15:17:27 +0000
committer: Jason W. Bacon <jwb@FreeBSD.org> 2021-03-21 15:17:27 +0000
commit: 5a899ba8ed1909cc03c8c66eee45bcc82e407ba0 (patch)
tree: b68312e23839f6cc6bd1b1d28243fd764915050b /biology
parent: f28acb9dcb1134e9eed638ce0a5f971d012d9a1d (diff)
download: ports-5a899ba8ed1909cc03c8c66eee45bcc82e407ba0.tar.gz ports-5a899ba8ed1909cc03c8c66eee45bcc82e407ba0.zip
4 files changed, 35 insertions, 0 deletions
diff --git a/biology/Makefile b/biology/Makefile
index 294c24aa9c6e..cba970a40784 100644
--- a/biology/Makefile
+++ b/biology/Makefile
@@ -179,6 +179,7 @@
     SUBDIR += trimadap
     SUBDIR += trimmomatic
     SUBDIR += ugene
+    SUBDIR += vcf-split
     SUBDIR += vcflib
     SUBDIR += vcftools
     SUBDIR += velvet
diff --git a/biology/vcf-split/Makefile b/biology/vcf-split/Makefile
new file mode 100644
index 000000000000..16a46e71b4a0
--- /dev/null
+++ b/biology/vcf-split/Makefile
@@ -0,0 +1,23 @@
+# $FreeBSD$
+
+PORTNAME=	vcf-split
+DISTVERSION=	0.1.1
+CATEGORIES=	biology
+
+MAINTAINER=	jwb@FreeBSD.org
+COMMENT=	Split a multi-sample VCF into single-sample VCFs
+
+LICENSE=	BSD2CLAUSE
+LICENSE_FILE=	${WRKSRC}/LICENSE
+
+BUILD_DEPENDS=	biolibc>=0.1.1:biology/biolibc
+
+USE_GITHUB=	yes
+GH_ACCOUNT=	auerlab
+
+PLIST_FILES=	bin/vcf-split man/man1/vcf-split.1.gz
+
+pre-build:	# run the "depend" target in the extracted source, passing LOCALBASE through
+	(cd ${WRKSRC} && ${MAKE} LOCALBASE=${LOCALBASE} depend)
+
+.include <bsd.port.mk>
diff --git a/biology/vcf-split/distinfo b/biology/vcf-split/distinfo
new file mode 100644
index 000000000000..d72992f743f0
--- /dev/null
+++ b/biology/vcf-split/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1616331493
+SHA256 (auerlab-vcf-split-0.1.1_GH0.tar.gz) = 07fb3aff5bf6038b251baa6c0cbff0600487766838b497468ab06d300488f310
+SIZE (auerlab-vcf-split-0.1.1_GH0.tar.gz) = 14226
diff --git a/biology/vcf-split/pkg-descr b/biology/vcf-split/pkg-descr
new file mode 100644
index 000000000000..689705295fcb
--- /dev/null
+++ b/biology/vcf-split/pkg-descr
@@ -0,0 +1,8 @@
+Vcf-split splits a multi-sample VCF into single-sample VCFs, writing thousands
+of output files simultaneously.  Parsing the TOPMed human chromosome 1 BCF
+with bcftools takes two days, so extracting the 137,977 samples one at a time
+or using thousands of parallel readers of the same file is impractical.
+Vcf-split solves this by generating thousands of single-sample outputs during
+a single sweep through the multi-sample input.
+
+WWW: https://github.com/auerlab/vcf-split
author	Jason W. Bacon <jwb@FreeBSD.org>	2021-03-21 15:17:27 +0000
committer	Jason W. Bacon <jwb@FreeBSD.org>	2021-03-21 15:17:27 +0000
commit	5a899ba8ed1909cc03c8c66eee45bcc82e407ba0 (patch)
tree	b68312e23839f6cc6bd1b1d28243fd764915050b /biology
parent	f28acb9dcb1134e9eed638ce0a5f971d012d9a1d (diff)
download	ports-5a899ba8ed1909cc03c8c66eee45bcc82e407ba0.tar.gz ports-5a899ba8ed1909cc03c8c66eee45bcc82e407ba0.zip