From 935c87a74bbf9c1fee898c093c1b066f43f4c4ef Mon Sep 17 00:00:00 2001 From: rdemko2332 Date: Wed, 25 Mar 2026 12:13:32 -0400 Subject: [PATCH 1/6] Pushing new runGeneCNVAndPloidyQuery script. Functionality for new merged workflow and postgres --- Main/bin/runGeneCNVAndPloidyQuery | 93 ++++++++++++++++++------------- 1 file changed, 53 insertions(+), 40 deletions(-) diff --git a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index 818dc8f29..7f5f9d039 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -7,43 +7,30 @@ use GUS::ObjRelP::DbiDatabase; use GUS::Supported::GusConfig; use CBIL::Util::PropertySet; -my ($gusConfigFile,$organismAbbrev,$geneSourceIdOrthologFile,$chrsForCalcsFile); -&GetOptions("organismAbbrev=s" => \$organismAbbrev, +my ($gusConfigFile,$taxonId,$orthoGroupFile,$geneSourceIdOrthologFile,$chrsForCalcsFile); +&GetOptions("taxonId=s" => \$taxonId, + "orthoGroupFile=s" => \$orthoGroupFile, "geneSourceIdOrthologFile=s" => \$geneSourceIdOrthologFile, "chrsForCalcsFile=s" => \$chrsForCalcsFile); -my $ploidy = 2; - -my $geneSourceSql = "with sequence as ( - select gf.source_id as gene_source_id - , gf.na_feature_id - , ns.source_id as contig_source_id - , ns.source_id as sequence_source_id - , ns.TAXON_ID - from dots.genefeature gf - , DOTS.NASEQUENCE ns - , SRES.ONTOLOGYTERM ot - where gf.na_sequence_id = ns.na_sequence_id - and ot.name = 'chromosome' - and ns.SEQUENCE_ONTOLOGY_ID = ot.ONTOLOGY_TERM_ID - and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev') - ), orthologs as ( - select gf.na_feature_id, sg.name - from dots.genefeature gf - , dots.SequenceSequenceGroup ssg - , dots.SequenceGroup sg - , core.TableInfo ti - where gf.na_feature_id = ssg.sequence_id - and ssg.sequence_group_id = sg.sequence_group_id - and ssg.source_table_id = ti.table_id - and ti.name = 'GeneFeature' - ) - select s.gene_source_id - , o.name - from sequence s - , orthologs o - 
where s.na_feature_id = o.na_feature_id"; - -my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev')"; + +my $proteinToGeneSql = " +SELECT aas.source_id AS protein_source_id, + gf.source_id AS gene_source_id +FROM dots.AASequence aas +JOIN dots.TranslatedAASequence tas ON aas.aa_sequence_id = tas.aa_sequence_id +JOIN dots.TranslatedAAFeature taf ON taf.aa_sequence_id = tas.aa_sequence_id +JOIN dots.Transcript t ON taf.na_feature_id = t.na_feature_id +JOIN dots.GeneFeature gf ON t.parent_id = gf.na_feature_id +WHERE aas.subclass_view = 'TranslatedAASequence' + AND aas.taxon_id = $taxonId + AND aas.taxon_id IN ( + SELECT taxon_id + FROM apidb.organism + WHERE is_annotated_genome = 1 + ) +"; + +my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = $taxonId"; $gusConfigFile = $ENV{GUS_HOME}."/config/gus.config"; die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; @@ -59,15 +46,41 @@ my $db = GUS::ObjRelP::DbiDatabase-> new($gusConfig->{props}->{dbiDsn}, my $dbh = $db->getQueryHandle(); -my $orthoMclStmt = $dbh->prepare($geneSourceSql); -$orthoMclStmt->execute(); +my $proteinToGeneStmt = $dbh->prepare($proteinToGeneSql); +$proteinToGeneStmt->execute(); + +my %proteinToGene; +while (my @row = $proteinToGeneStmt->fetchrow_array()){ + $proteinToGene{$row[0]} = $row[1]; +} -open(GENE,">$geneSourceIdOrthologFile"); -while (my @row = $orthoMclStmt->fetchrow_array()){ - print GENE "$row[0]\t$row[1]\n"; +my %proteinToGroup; +open(GROUPS, "<$orthoGroupFile") or die "Cannot open $orthoGroupFile: $!"; +while (my $line = <GROUPS>) { + chomp $line; + my ($groupId, $proteinList) = split(/:\s*/, $line, 2); + next unless defined $proteinList; + 
foreach my $protein (split(/\s+/, $proteinList)) { + $proteinToGroup{$protein} = $groupId; + } +} +close GROUPS; + +my @proteinsWithNoGroup; +open(GENE, ">$geneSourceIdOrthologFile") or die "Cannot open $geneSourceIdOrthologFile: $!"; +while (my ($protein, $gene) = each %proteinToGene) { + if (my $group = $proteinToGroup{$protein}) { + print GENE "$gene\t$group\n"; + } else { + push @proteinsWithNoGroup, $protein; + } } close GENE; +if (@proteinsWithNoGroup) { + die "The following proteins have no group assignment in $orthoGroupFile:\n" . join("\n", @proteinsWithNoGroup) . "\n"; +} + my $chrsForCalcs = $dbh->prepare($chrsForCalcsSql); $chrsForCalcs->execute(); From 9286b0c7f5969b1cb01ff5dd4bf6fd78645c9502 Mon Sep 17 00:00:00 2001 From: rdemko2332 Date: Wed, 25 Mar 2026 13:15:53 -0400 Subject: [PATCH 2/6] Adding error message if chrsToCalcsFile failed to be opened --- Main/bin/runGeneCNVAndPloidyQuery | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index 7f5f9d039..035c07686 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -84,7 +84,7 @@ if (@proteinsWithNoGroup) { my $chrsForCalcs = $dbh->prepare($chrsForCalcsSql); $chrsForCalcs->execute(); -open(CHRS,">$chrsForCalcsFile"); +open(CHRS, ">$chrsForCalcsFile") or die "Cannot open $chrsForCalcsFile: $!"; while (my @row = $chrsForCalcs->fetchrow_array()){ print CHRS "$row[0]\t\n"; } From 336ca6c78e097337509945c03e13b12dbd8a21c4 Mon Sep 17 00:00:00 2001 From: rdemko2332 Date: Wed, 25 Mar 2026 13:28:40 -0400 Subject: [PATCH 3/6] Not throwing error, just reporting --- Main/bin/runGeneCNVAndPloidyQuery | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index 035c07686..b8fa16143 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -78,7 +78,7 @@ while (my ($protein, $gene) = 
each %proteinToGene) { close GENE; if (@proteinsWithNoGroup) { - die "The following proteins have no group assignment in $orthoGroupFile:\n" . join("\n", @proteinsWithNoGroup) . "\n"; + print STDERR "The following proteins have no group assignment in $orthoGroupFile:\n" . join("\n", @proteinsWithNoGroup) . "\n"; } my $chrsForCalcs = $dbh->prepare($chrsForCalcsSql); From b7c3439ae8efeba6adf4ea289145b84a55d4ad8e Mon Sep 17 00:00:00 2001 From: rdemko2332 Date: Wed, 25 Mar 2026 13:39:15 -0400 Subject: [PATCH 4/6] Resolving sequence syntax change --- Main/bin/runGeneCNVAndPloidyQuery | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index b8fa16143..d3d85e0a2 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -69,7 +69,12 @@ close GROUPS; my @proteinsWithNoGroup; open(GENE, ">$geneSourceIdOrthologFile") or die "Cannot open $geneSourceIdOrthologFile: $!"; while (my ($protein, $gene) = each %proteinToGene) { - if (my $group = $proteinToGroup{$protein}) { + my $group = $proteinToGroup{$protein}; + unless ($group) { + (my $altProtein = $protein) =~ s/:/\_/g; + $group = $proteinToGroup{$altProtein}; + } + if ($group) { print GENE "$gene\t$group\n"; } else { push @proteinsWithNoGroup, $protein; From 3aabc51d73ae10660859c40ea6aa08096fd4a744 Mon Sep 17 00:00:00 2001 From: Richard Demko Date: Wed, 25 Mar 2026 14:53:40 -0400 Subject: [PATCH 5/6] Copying gene trees to persistent cache --- .../perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm b/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm index 58467597a..d7e6f7b07 100644 --- a/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm +++ b/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm @@ -68,6 +68,8 @@ sub run { 
$self->runCmd(0, "cp -r ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/${nextflowWorkflow}_${nextflowBranch}/**/similar_groups.tsv ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/"); + $self->runCmd(0, "cp -r ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/${nextflowWorkflow}_${nextflowBranch}/**/geneTrees ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/"); + $self->runCmd(0, "rm -rf ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/"); $self->runCmd(0, "tar -czf ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/peripheralCacheDir.tar.gz -C ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache peripheralCacheDir"); From a174922de43e5e9a037f05c0a1ebc3ac9db2e02d Mon Sep 17 00:00:00 2001 From: Richard Demko Date: Wed, 25 Mar 2026 14:54:44 -0400 Subject: [PATCH 6/6] Removing unneeded rm commands --- .../perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm | 2 -- 1 file changed, 2 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm b/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm index d7e6f7b07..777eec9d8 100644 --- a/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm +++ b/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm @@ -32,10 +32,8 @@ sub run { $self->runCmd(0, "cp -r $checkSumFile ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/checkSum.tsv"); - $self->runCmd(0, "rm -rf ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/groupFastas"); $self->runCmd(0, "cp -r ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/${nextflowWorkflow}_${nextflowBranch}/**/groupFastas ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/"); - $self->runCmd(0, "rm 
-rf ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/peripheralCacheDir"); $self->runCmd(0, "cp -r ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/${nextflowWorkflow}_${nextflowBranch}/**/newPeripheralDiamondCache/ ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/peripheralCacheDir"); $self->runCmd(0, "cp -r ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/${nextflowWorkflow}_${nextflowBranch}/**/buildVersion.txt ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/residualBuildVersion.txt");