46 lines
1.6 KiB
Python
46 lines
1.6 KiB
Python
def qc_charr_score(self, df: pd.DataFrame) -> float:
|
|
"""
|
|
calculate charr score
|
|
Args:
|
|
df(pd.DataFrame): dataframe of parser vcf
|
|
Returns:
|
|
float, charr score
|
|
"""
|
|
|
|
def charr_score(row: pd.Series) -> float:
|
|
"""
|
|
charr score for one site
|
|
Args:
|
|
row(pd.Series): one site
|
|
Returns:
|
|
float, charr score
|
|
"""
|
|
ad = [int(i) for i in row["ad"].split(",")] # allele depth
|
|
freq = float(row["freq"])
|
|
if (ad[0] + ad[1]) == 0 or freq == 0:
|
|
return 0 # The denominator cannot be zero
|
|
return ad[0] / (freq * (ad[0] + ad[1]))
|
|
|
|
# autosome
|
|
data = df[df["chrom"].isin([f"chr{i}" for i in range(1, 23)])]
|
|
|
|
data = data[data["gt"].isin(["1/1", "1|1"])] # Homogenous variant sites
|
|
data = data[(data["dp"] > 30) & (data["gq"] > 20)] # filter
|
|
|
|
def get_freq(row: pd.Series) -> float:
|
|
"""
|
|
get reference freq for one site in population
|
|
Args:
|
|
row(pd.Series): one site
|
|
Returns:
|
|
float, reference freq
|
|
"""
|
|
key = (row["chrom"], row["position"])
|
|
return self.charr_required_freq.get(key, 0.0001) # If not, fill in the minimum value to avoid division by 0
|
|
|
|
data["freq"] = data.apply(get_freq, axis=1)
|
|
if data.shape[0] == 0:
|
|
return 0.0 # no site after filter
|
|
score = round(data.apply(charr_score, axis=1).sum() / data.shape[0], 4) # mean and retain four decimal places
|
|
|
|
return score |