From 5dc2dc0b95cd4e02a676bd137b96d4e9b8ccaaee Mon Sep 17 00:00:00 2001 From: Xiuwen Zheng Date: Thu, 21 Mar 2024 00:19:42 -0500 Subject: [PATCH] seqGetData() --- NEWS | 10 ++- src/GetData.cpp | 143 +++++++++++++++++++++++++++++++++++++++++- src/ReadByVariant.cpp | 74 ++++++++++++++++++---- src/ReadByVariant.h | 6 +- 4 files changed, 217 insertions(+), 16 deletions(-) diff --git a/NEWS b/NEWS index 7d4e638..9bd1fb6 100644 --- a/NEWS +++ b/NEWS @@ -6,7 +6,15 @@ UTILITIES o tweak the display of progress information in `seqVCF2GDS()` o `seqVCF_Header(, getnum=TRUE, verbose=TRUE)` to show the progress - information for scanning the file + information for scanning the VCF file + + o new `seqGetData(, "$dosage_alt2")` and `seqGetData(, "$dosage_sp2")` for + sex chromosomes, when the alleles are partially missing + +BUG FIXES + + o `seqGetData(, "$dosage_alt")` and `seqGetData(, "$dosage_sp")` work + correctly when the ploidy is >2 and there are missing alleles CHANGES IN VERSION 1.42.1 diff --git a/src/GetData.cpp b/src/GetData.cpp index e367de2..27deb28 100755 --- a/src/GetData.cpp +++ b/src/GetData.cpp @@ -2,7 +2,7 @@ // // GetData.cpp: Get data from a SeqArray GDS file // -// Copyright (C) 2015-2022 Xiuwen Zheng +// Copyright (C) 2015-2024 Xiuwen Zheng // // This file is part of SeqArray. // @@ -61,7 +61,9 @@ static const string VAR_PHASE("phase"); // variable list: internally generated static const string VAR_DOSAGE("$dosage"); static const string VAR_DOSAGE_ALT("$dosage_alt"); +static const string VAR_DOSAGE_ALT2("$dosage_alt2"); static const string VAR_DOSAGE_SP("$dosage_sp"); +static const string VAR_DOSAGE_SP2("$dosage_sp2"); static const string VAR_NUM_ALLELE("$num_allele"); static const string VAR_REF_ALLELE("$ref"); static const string VAR_ALT_ALLELE("$alt"); @@ -297,6 +299,40 @@ static SEXP get_dosage_alt(CFileInfo &File, TVarMap &Var, void *param) return rv_ans; } +/// get dosage of alternative allele(s) from 'genotype/data' +static SEXP get_dosage_alt2(CFileInfo &File, TVarMap &Var, void *param) +{ + const TParam *P = (const TParam*)param; + SEXP rv_ans = R_NilValue; + const ssize_t nSample = File.SampleSelNum(); + const ssize_t nVariant = File.VariantSelNum(); + if ((nSample > 0) && (nVariant > 0)) + { + // initialize GDS genotype Node + CApply_Variant_Dosage NodeVar(File, false, true); + if (!get_geno_is_i32(P, NodeVar)) + { + rv_ans = PROTECT(allocMatrix(RAWSXP, nSample, nVariant)); + C_UInt8 *base = (C_UInt8 *)RAW(rv_ans); + do { + NodeVar.ReadDosageAlt_p(base); + base += nSample; + } while (NodeVar.Next()); + } else { + rv_ans = PROTECT(allocMatrix(INTSXP, nSample, nVariant)); + int *base = INTEGER(rv_ans); + do { + NodeVar.ReadDosageAlt_p(base); + base += nSample; + } while (NodeVar.Next()); + } + SET_DIMNAMES(rv_ans, R_Dosage_Name); + UNPROTECT(1); + } + // output + return rv_ans; +} + inline static void check_vector_size(size_t n) { #ifdef COREARRAY_REGISTER_BIT64 @@ -404,6 +440,105 @@ static SEXP get_dosage_sp(CFileInfo &File, TVarMap &Var, void *param) return rv_ans; } +/// get sparse form of dosage of alternative allele(s), allow partial missing +static SEXP get_dosage_sp2(CFileInfo &File, TVarMap &Var, void *param) +{ + SEXP rv_ans = R_NilValue; + const ssize_t nSample = File.SampleSelNum(); + const ssize_t nVariant = File.VariantSelNum(); + if ((nSample > 0) && (nVariant > 0)) + { + // initialize GDS genotype Node + CApply_Variant_Dosage NodeVar(File, false, true); + const bool isI32 = NodeVar.NeedIntType(); + // dgCMatrix@x, @i, @p + SEXP x_r, i_r; + SEXP p_r = PROTECT(NEW_INTEGER(nVariant+1)); + int *p = INTEGER(p_r), i_col; + p[0] = i_col = 0; + // use RAW or integer + if (!isI32 && nSample<16777216) // 2^24 + { + SEXP buffer = PROTECT(NEW_RAW(nSample)); + C_UInt8 *g_buf = (C_UInt8*)RAW(buffer); + // dgCMatrix@i & @x (i << 8 | x) + vector i_x; + i_x.reserve(nSample); + do { + NodeVar.ReadDosageAlt_p(g_buf); + // get # of nonzero + size_t nnzero = vec_i8_count((char *)g_buf, nSample, 0); + i_x.reserve(i_x.size() + nnzero); + // fill i & x + for (ssize_t j=0; j < nSample; j++) + { + C_UInt8 g = g_buf[j]; + if (g != 0) + i_x.push_back((C_UInt32(j) << 8) | g); + } + // update p + p[++i_col] = i_x.size(); + } while (NodeVar.Next()); + UNPROTECT(1); // free buffer + const size_t n = i_x.size(); + check_vector_size(n); + // dgCMatrix@x & @i + x_r = PROTECT(NEW_NUMERIC(n)); + i_r = PROTECT(NEW_INTEGER(n)); + double *x_r_p = REAL(x_r); + int *i_r_p = INTEGER(i_r); + for (size_t i=0; i < n; i++) + { + C_UInt32 v = i_x[i]; + C_UInt8 g = v; + x_r_p[i] = (g != NA_RAW) ? g : NA_REAL; + i_r_p[i] = v >> 8; + } + } else { + SEXP buffer = PROTECT(NEW_INTEGER(nSample)); + int *g_buf = INTEGER(buffer); + // dgCMatrix@i & @x (i << 32 | x) + vector i_x; + i_x.reserve(nSample); + do { + NodeVar.ReadDosageAlt_p(g_buf); + // get # of nonzero + size_t nnzero = vec_i32_count(g_buf, nSample, 0); + i_x.reserve(i_x.size() + nnzero); + // fill i & x + for (ssize_t j=0; j < nSample; j++) + { + int g = g_buf[j]; + if (g != 0) + i_x.push_back((C_UInt64(j) << 32) | C_UInt32(g)); + } + // update p + p[++i_col] = i_x.size(); + } while (NodeVar.Next()); + UNPROTECT(1); // free buffer + const size_t n = i_x.size(); + check_vector_size(n); + // dgCMatrix@x & @i + x_r = PROTECT(NEW_NUMERIC(n)); + i_r = PROTECT(NEW_INTEGER(n)); + double *x_r_p = REAL(x_r); + int *i_r_p = INTEGER(i_r); + for (size_t i=0; i < n; i++) + { + C_UInt64 v = i_x[i]; + int g = v; + x_r_p[i] = (g != NA_INTEGER) ? g : NA_REAL; + i_r_p[i] = v >> 32; + } + } + // new dgCMatrix + rv_ans = GDS_New_SpCMatrix2(x_r, i_r, p_r, nSample, nVariant); + UNPROTECT(3); + } + // output + return rv_ans; +} + /// get the number of alleles for each variant static SEXP get_num_allele(CFileInfo &File, TVarMap &Var, void *param) { @@ -1013,9 +1148,15 @@ COREARRAY_DLL_LOCAL TVarMap &VarGetStruct(CFileInfo &File, const string &name) } else if (name == VAR_DOSAGE_ALT) { vm.Func = get_dosage_alt; + } else if (name == VAR_DOSAGE_ALT2) + { + vm.Func = get_dosage_alt2; } else if (name == VAR_DOSAGE_SP) { vm.Func = get_dosage_sp; + } else if (name == VAR_DOSAGE_SP2) + { + vm.Func = get_dosage_sp2; } else if (name == VAR_NUM_ALLELE) { vm.Init(File, VAR_ALLELE, get_num_allele); diff --git a/src/ReadByVariant.cpp b/src/ReadByVariant.cpp index 4aabeee..b0d1888 100755 --- a/src/ReadByVariant.cpp +++ b/src/ReadByVariant.cpp @@ -2,7 +2,7 @@ // // ReadByVariant.cpp: Read data variant by variant // -// Copyright (C) 2013-2022 Xiuwen Zheng +// Copyright (C) 2013-2024 Xiuwen Zheng // // This file is part of SeqArray. // @@ -391,7 +391,6 @@ void CApply_Variant_Dosage::ReadDosage(int *Base) { int *p = (int *)ExtPtr2.get(); int missing = _ReadGenoData(p); - // count the number of reference allele if (Ploidy == 2) // diploid { @@ -417,7 +416,6 @@ void CApply_Variant_Dosage::ReadDosageAlt(int *Base) { int *p = (int *)ExtPtr2.get(); int missing = _ReadGenoData(p); - // count the number of reference allele if (Ploidy == 2) // diploid { @@ -428,29 +426,54 @@ void CApply_Variant_Dosage::ReadDosageAlt(int *Base) int cnt = 0; for (int m=Ploidy; m > 0; m--, p++) { - if (*p != 0) + if (*p == missing) { - if (cnt != NA_INTEGER) cnt ++; - } else if (*p == missing) cnt = NA_INTEGER; + } else if (*p != 0) + { + if (cnt != NA_INTEGER) cnt ++; + } } *Base ++ = cnt; } } } +void CApply_Variant_Dosage::ReadDosageAlt_p(int *Base) +{ + int *p = (int *)ExtPtr2.get(); + int missing = _ReadGenoData(p); + // count the number of reference allele + /* + if (Ploidy == 2) // diploid + { + vec_i32_cnt_dosage_alt2_p(p, Base, SampNum, 0, missing, NA_INTEGER); + } else */ { + for (int n=SampNum; n > 0; n--) + { + int cnt = 0, non_miss = Ploidy; + for (int m=Ploidy; m > 0; m--, p++) + { + if (*p == missing) + non_miss --; + else if (*p != 0) cnt ++; + } + *Base ++ = (non_miss > 0) ? cnt : NA_INTEGER; + } + } +} + void CApply_Variant_Dosage::ReadDosage(C_UInt8 *Base) { C_UInt8 *p = (C_UInt8 *)ExtPtr2.get(); C_UInt8 missing = _ReadGenoData(p); - // count the number of reference allele if (Ploidy == 2) // diploid { vec_i8_cnt_dosage2((int8_t *)p, (int8_t *)Base, SampNum, 0, missing, NA_RAW); } else { - C_UInt8 *p = (C_UInt8 *)ExtPtr.get(); + const C_UInt8 *p = (const C_UInt8 *)ExtPtr.get(); for (int n=SampNum; n > 0; n--) { C_UInt8 cnt = 0; @@ -471,30 +494,55 @@ void CApply_Variant_Dosage::ReadDosageAlt(C_UInt8 *Base) { C_UInt8 *p = (C_UInt8 *)ExtPtr2.get(); C_UInt8 missing = _ReadGenoData(p); - // count the number of reference allele if (Ploidy == 2) // diploid { vec_i8_cnt_dosage_alt2((int8_t *)p, (int8_t *)Base, SampNum, 0, missing, NA_RAW); } else { - C_UInt8 *p = (C_UInt8 *)ExtPtr.get(); for (int n=SampNum; n > 0; n--) { C_UInt8 cnt = 0; for (int m=Ploidy; m > 0; m--, p++) { - if (*p != 0) + if (*p == missing) { - if (cnt != NA_RAW) cnt ++; - } else if (*p == missing) cnt = NA_RAW; + } else if (*p != 0) + { + if (cnt != NA_RAW) cnt ++; + } } *Base ++ = cnt; } } } +void CApply_Variant_Dosage::ReadDosageAlt_p(C_UInt8 *Base) +{ + C_UInt8 *p = (C_UInt8 *)ExtPtr2.get(); + C_UInt8 missing = _ReadGenoData(p); + // count the number of reference allele + /* + if (Ploidy == 2) // diploid + { + vec_i8_cnt_dosage_alt2_p((int8_t *)p, (int8_t *)Base, SampNum, 0, + missing, NA_RAW); + } else */ { + for (int n=SampNum; n > 0; n--) + { + C_UInt8 cnt = 0, non_miss = Ploidy; + for (int m=Ploidy; m > 0; m--, p++) + { + if (*p == missing) + non_miss --; + else if (*p != 0) cnt ++; + } + *Base ++ = (non_miss > 0) ? cnt : NA_RAW; + } + } +} + // ===================================================================== diff --git a/src/ReadByVariant.h b/src/ReadByVariant.h index 6dadde7..81f78c3 100755 --- a/src/ReadByVariant.h +++ b/src/ReadByVariant.h @@ -2,7 +2,7 @@ // // ReadByVariant.h: Read data variant by variant // -// Copyright (C) 2013-2022 Xiuwen Zheng +// Copyright (C) 2013-2024 Xiuwen Zheng // // This file is part of SeqArray. // @@ -131,10 +131,14 @@ class COREARRAY_DLL_LOCAL CApply_Variant_Dosage: public CApply_Variant_Geno void ReadDosage(int *Base); /// read dosages of alternative alleles in 32-bit integer void ReadDosageAlt(int *Base); + /// read dosages of alternative alleles in 32-bit integer, allowing partial missing + void ReadDosageAlt_p(int *Base); /// read dosages in unsigned 8-bit intetger void ReadDosage(C_UInt8 *Base); /// read dosages of alternative alleles in unsigned 8-bit intetger void ReadDosageAlt(C_UInt8 *Base); + /// read dosages of alternative alleles in unsigned 8-bit intetger, allowing partial missing + void ReadDosageAlt_p(C_UInt8 *Base); };