From bd8e9f0b2d8cc785a49e1cdaa8e886fc2d2d6f58 Mon Sep 17 00:00:00 2001 From: shuaizhou Date: Tue, 16 Dec 2025 11:19:26 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20share.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- share.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 share.py diff --git a/share.py b/share.py new file mode 100644 index 0000000..dcf7fc9 --- /dev/null +++ b/share.py @@ -0,0 +1,46 @@ + def qc_charr_score(self, df: pd.DataFrame) -> float: + """ + calculate charr score + Args: + df(pd.DataFrame): dataframe of parser vcf + Returns: + float, charr score + """ + + def charr_score(row: pd.Series) -> float: + """ + charr score for one site + Args: + row(pd.Series): one site + Returns: + float, charr score + """ + ad = [int(i) for i in row["ad"].split(",")] # allele depth + freq = float(row["freq"]) + if (ad[0] + ad[1]) == 0 or freq == 0: + return 0 # The denominator cannot be zero + return ad[0] / (freq * (ad[0] + ad[1])) + + # autosome + data = df[df["chrom"].isin([f"chr{i}" for i in range(1, 23)])] + + data = data[data["gt"].isin(["1/1", "1|1"])] # Homogenous variant sites + data = data[(data["dp"] > 30) & (data["gq"] > 20)] # filter + + def get_freq(row: pd.Series) -> float: + """ + get reference freq for one site in population + Args: + row(pd.Series): one site + Returns: + float, reference freq + """ + key = (row["chrom"], row["position"]) + return self.charr_required_freq.get(key, 0.0001) # If not, fill in the minimum value to avoid division by 0 + + data["freq"] = data.apply(get_freq, axis=1) + if data.shape[0] == 0: + return 0.0 # no site after filter + score = round(data.apply(charr_score, axis=1).sum() / data.shape[0], 4) # mean and retain four decimal places + + return score \ No newline at end of file