diff --git a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index 818dc8f29..d3d85e0a2 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -7,43 +7,30 @@ use GUS::ObjRelP::DbiDatabase; use GUS::Supported::GusConfig; use CBIL::Util::PropertySet; -my ($gusConfigFile,$organismAbbrev,$geneSourceIdOrthologFile,$chrsForCalcsFile); -&GetOptions("organismAbbrev=s" => \$organismAbbrev, +my ($gusConfigFile,$taxonId,$orthoGroupFile,$geneSourceIdOrthologFile,$chrsForCalcsFile); +&GetOptions("taxonId=s" => \$taxonId, + "orthoGroupFile=s" => \$orthoGroupFile, "geneSourceIdOrthologFile=s" => \$geneSourceIdOrthologFile, "chrsForCalcsFile=s" => \$chrsForCalcsFile); -my $ploidy = 2; - -my $geneSourceSql = "with sequence as ( - select gf.source_id as gene_source_id - , gf.na_feature_id - , ns.source_id as contig_source_id - , ns.source_id as sequence_source_id - , ns.TAXON_ID - from dots.genefeature gf - , DOTS.NASEQUENCE ns - , SRES.ONTOLOGYTERM ot - where gf.na_sequence_id = ns.na_sequence_id - and ot.name = 'chromosome' - and ns.SEQUENCE_ONTOLOGY_ID = ot.ONTOLOGY_TERM_ID - and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev') - ), orthologs as ( - select gf.na_feature_id, sg.name - from dots.genefeature gf - , dots.SequenceSequenceGroup ssg - , dots.SequenceGroup sg - , core.TableInfo ti - where gf.na_feature_id = ssg.sequence_id - and ssg.sequence_group_id = sg.sequence_group_id - and ssg.source_table_id = ti.table_id - and ti.name = 'GeneFeature' - ) - select s.gene_source_id - , o.name - from sequence s - , orthologs o - where s.na_feature_id = o.na_feature_id"; - -my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev')"; + +my $proteinToGeneSql = " +SELECT aas.source_id AS protein_source_id, + gf.source_id AS gene_source_id +FROM dots.AASequence aas +JOIN dots.TranslatedAASequence tas ON aas.aa_sequence_id = tas.aa_sequence_id +JOIN dots.TranslatedAAFeature taf ON taf.aa_sequence_id = tas.aa_sequence_id +JOIN dots.Transcript t ON taf.na_feature_id = t.na_feature_id +JOIN dots.GeneFeature gf ON t.parent_id = gf.na_feature_id +WHERE aas.subclass_view = 'TranslatedAASequence' + AND aas.taxon_id = $taxonId + AND aas.taxon_id IN ( + SELECT taxon_id + FROM apidb.organism + WHERE is_annotated_genome = 1 + ) +"; + +my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = $taxonId"; $gusConfigFile = $ENV{GUS_HOME}."/config/gus.config"; die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; @@ -59,19 +46,50 @@ my $db = GUS::ObjRelP::DbiDatabase-> new($gusConfig->{props}->{dbiDsn}, my $dbh = $db->getQueryHandle(); -my $orthoMclStmt = $dbh->prepare($geneSourceSql); -$orthoMclStmt->execute(); +my $proteinToGeneStmt = $dbh->prepare($proteinToGeneSql); +$proteinToGeneStmt->execute(); + +my %proteinToGene; +while (my @row = $proteinToGeneStmt->fetchrow_array()){ + $proteinToGene{$row[0]} = $row[1]; +} -open(GENE,">$geneSourceIdOrthologFile"); -while (my @row = $orthoMclStmt->fetchrow_array()){ - print GENE "$row[0]\t$row[1]\n"; +my %proteinToGroup; +open(GROUPS, "<$orthoGroupFile") or die "Cannot open $orthoGroupFile: $!"; +while (my $line = ) { + chomp $line; + my ($groupId, $proteinList) = split(/:\s*/, $line, 2); + next unless defined $proteinList; + foreach my $protein (split(/\s+/, $proteinList)) { + $proteinToGroup{$protein} = $groupId; + } +} +close GROUPS; + +my @proteinsWithNoGroup; +open(GENE, ">$geneSourceIdOrthologFile") or die "Cannot open $geneSourceIdOrthologFile: $!"; +while (my ($protein, $gene) = each %proteinToGene) { + my $group = $proteinToGroup{$protein}; + unless ($group) { + (my $altProtein = $protein) =~ s/:/\_/g; + $group = $proteinToGroup{$altProtein}; + } + if ($group) { + print GENE "$gene\t$group\n"; + } else { + push @proteinsWithNoGroup, $protein; + } } close GENE; +if (@proteinsWithNoGroup) { + print STDERR "The following proteins have no group assignment in $orthoGroupFile:\n" . join("\n", @proteinsWithNoGroup) . "\n"; +} + my $chrsForCalcs = $dbh->prepare($chrsForCalcsSql); $chrsForCalcs->execute(); -open(CHRS,">$chrsForCalcsFile"); +open(CHRS, ">$chrsForCalcsFile") or die "Cannot open $chrsForCalcsFile: $!"; while (my @row = $chrsForCalcs->fetchrow_array()){ print CHRS "$row[0]\t\n"; } diff --git a/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm b/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm index 2c7cd538a..20d7c26f7 100644 --- a/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm +++ b/Main/lib/perl/WorkflowSteps/UpdateOrthoPeripheralPersistentCache.pm @@ -34,6 +34,7 @@ sub run { $self->runCmd(0, "cp -r ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/${nextflowWorkflow}_${nextflowBranch}/**/groupFastas ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/"); + $self->runCmd(0, "cp -r ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/${nextflowWorkflow}_${nextflowBranch}/**/residualGroupFastas ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/"); $self->runCmd(0, "cp -r ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/genesAndProteins/${nextflowWorkflow}_${nextflowBranch}/**/newPeripheralDiamondCache/ ${preprocessedDataCache}/OrthoMCL/OrthoMCL_peripheralGroups/officialDiamondCache/peripheralCacheDir");