Statistics Results

Using Hypergeometric Distribution to compute P-values

  • Using R statistical package, phyper()
  • Hypergeometric Distribution Model
    • white balls = pmids marked with a particular MeSH term (m)
    • black balls = pmids not marked with a particular MeSH term (n) = N-m
    • total balls (N)
    • set drawn = pmids in GeneRIFs for a gene
    • x = # of pmids marked with the MeSH term among the GeneRIFs for a gene
  • Use the set of articles marked "Diseases Category" (Category C) as N
    • April 17 2007: 8581623 ≈ 8.6 million
  • List of children for each mesh term
-- Pairs every braindisease term with itself and with each term below it in
-- the MeSH tree: a row (term, child) exists when the child's tree_num equals
-- the term's tree_num or extends it with a '.'-separated suffix.
CREATE TABLE IF NOT EXISTS braindisease_child (
    term  VARCHAR(256),
    child VARCHAR(256),
    PRIMARY KEY (term, child)
) AS
SELECT DISTINCT
    ancestor.term,
    descendant.term AS child
FROM braindisease AS ancestor
INNER JOIN braindisease AS descendant
    ON descendant.tree_num = ancestor.tree_num
    OR descendant.tree_num LIKE CONCAT(ancestor.tree_num, '.%');
  • Count for each term, # of citations
-- m for phyper: for each term in braindisease_child, the number of distinct
-- pmids in pubmed_mesh tagged with that term or any of its children.
-- The count is aliased num_pmid because the phyper input query reads
-- term_citations.num_pmid; without the alias the view column would be
-- named after the expression and that reference would not resolve.
CREATE VIEW term_citations AS
SELECT
    bd.term,
    COUNT(DISTINCT pubmed_mesh.pmid) AS num_pmid
FROM pubmed_mesh
INNER JOIN braindisease_child AS bd
    ON pubmed_mesh.term = bd.child
GROUP BY bd.term;
  • k = # of unique pmids among GeneRIFs for a gene
-- k for phyper: one row per gene_id holding the number of distinct pmids
-- that generif lists for that gene.
CREATE TABLE gene_citations (
    gene_id  INT,
    num_pmid INT,
    PRIMARY KEY (gene_id)
) AS
SELECT
    gene_id,
    COUNT(DISTINCT generif.pmid) AS num_pmid
FROM generif
GROUP BY gene_id;
  • Generate table of values for phyper:
    • general version - do computation of n in R
-- Input rows for phyper, one per (gene, term) pair present in
-- gene_term_citations:
--   x = pmids for the gene that carry the term   (gene_term_citations)
--   m = pmids carrying the term                  (term_citations)
--   k = pmids for the gene                       (gene_citations)
-- n is not selected here; it is derived from m in R.
-- The statement is now terminated with ';' like the other statements on
-- this page, so it can be run as part of a script.
SELECT
    gene.gene_id,
    locus,
    tc.term,
    gtc.num_pmid AS x,
    tc.num_pmid  AS m,
    gc.num_pmid  AS k
FROM gene
INNER JOIN gene_term_citations AS gtc
    ON gtc.gene_id = gene.gene_id
INNER JOIN gene_citations AS gc
    ON gc.gene_id = gene.gene_id
INNER JOIN term_citations AS tc
    ON tc.term = gtc.term;
  • n = 15806221 - m, where N = 15806221 is the number of all PubMed articles with MeSH
  • n = 660538 - m, where N = 660538 is the number of all PubMed articles with Brain Disease MeSH
  • Statistics in R

Significance: evaluated with respect to all Disease PubMed articles
Relevance: evaluated with respect to Brain Disease PubMed articles

# Load the tab-delimited (gene, term) counts. Columns 4, 5 and 6 are passed
# to phyper as x, m and k; n is computed here as N - m.
geneterm_stats <- read.delim("gene-term.txt")

# phyper(q, m, n, k, lower.tail = FALSE) returns P[X > q]. Passing x - 1
# makes the result P[X >= x], so the observed count itself is included in
# the upper tail of the p-value.
# Significance: population size N = 15806221.
geneterm_stats <- cbind(
  geneterm_stats,
  phyper(geneterm_stats[, 4] - 1,
         geneterm_stats[, 5],
         15806221 - geneterm_stats[, 5],
         geneterm_stats[, 6],
         lower.tail = FALSE)
)
colnames(geneterm_stats)[7] <- "Significance"

# Relevance: same test with population size N = 660538.
geneterm_stats <- cbind(
  geneterm_stats,
  phyper(geneterm_stats[, 4] - 1,
         geneterm_stats[, 5],
         660538 - geneterm_stats[, 5],
         geneterm_stats[, 6],
         lower.tail = FALSE)
)
colnames(geneterm_stats)[8] <- "Relevance"

# Rows ordered by ascending Significance (column 7).
geneterm_stats.sorted <- geneterm_stats[order(geneterm_stats[, 7]), ]

Version for homologene

# Homologene variant: same two hypergeometric tests, with the counts taken
# from the named columns braindisease_refs, generif_refs and term_refs.
# NOTE(review): the new columns are renamed by position (7 and 8), which
# assumes the input file has exactly six columns -- confirm.
homologene_stats <- read.delim("generif_homologene_results.txt")

# phyper(q, m, n, k, lower.tail = FALSE) returns P[X > q]. Passing the
# observed overlap minus 1 yields P[X >= overlap].
# Significance: population size N = 15806221.
homologene_stats <- cbind(
  homologene_stats,
  phyper(homologene_stats[, "braindisease_refs"] - 1,
         homologene_stats[, "generif_refs"],
         15806221 - homologene_stats[, "generif_refs"],
         homologene_stats[, "term_refs"],
         lower.tail = FALSE)
)
colnames(homologene_stats)[7] <- "Significance"

# Relevance: same test with population size N = 660538.
homologene_stats <- cbind(
  homologene_stats,
  phyper(homologene_stats[, "braindisease_refs"] - 1,
         homologene_stats[, "generif_refs"],
         660538 - homologene_stats[, "generif_refs"],
         homologene_stats[, "term_refs"],
         lower.tail = FALSE)
)
colnames(homologene_stats)[8] <- "Relevance"

Archive of ideas for Hypergeometric Distribution

SQL

  • currently NOT using views here due to MySQL bugs
-- x for phyper: per (gene_id, term), the number of distinct generif pmids
-- that pubmed_mesh tags with a braindisease term located at or below that
-- term in the MeSH tree (equal tree_num, or tree_num extended by '.').
CREATE TABLE gene_term_citations AS
SELECT
    gene_id,
    ancestor.term,
    COUNT(DISTINCT generif.pmid) AS num_pmid
FROM generif
INNER JOIN pubmed_mesh
    ON pubmed_mesh.pmid = generif.pmid
INNER JOIN braindisease AS tagged
    ON tagged.term = pubmed_mesh.term
INNER JOIN braindisease AS ancestor
    ON tagged.tree_num = ancestor.tree_num
    OR tagged.tree_num LIKE CONCAT(ancestor.tree_num, '.%')
GROUP BY gene_id, ancestor.term;
  • EXPLAIN output (query plan) for the gene_term_related_citations query below
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE bdc index child PRIMARY 516 NULL 1091 Using index; Using temporary; Using filesort
1 SIMPLE pubmed_mesh ref PRIMARY,pmid PRIMARY 258 warrendb.bdc.child 614 Using index
1 SIMPLE generif index pmid PRIMARY 40 NULL 21943 Using index
1 SIMPLE related_articles eq_ref PRIMARY PRIMARY 8 warrendb.generif.pmid,warrendb.pubmed_mesh.pmid 1 Using index
-- Per (gene_id, term): the number of distinct related_pmid values reached
-- from the gene's generif pmids through related_articles, restricted to
-- related articles that pubmed_mesh tags with the term or one of its
-- children (via braindisease_child).
CREATE TABLE gene_term_related_citations (
    gene_id  INT,
    term     VARCHAR(256),
    num_pmid INT,
    PRIMARY KEY (gene_id, term)
) AS
SELECT
    gene_id,
    bdc.term,
    COUNT(DISTINCT related_articles.related_pmid) AS num_pmid
FROM generif
INNER JOIN related_articles
    ON related_articles.pmid = generif.pmid
INNER JOIN pubmed_mesh
    ON pubmed_mesh.pmid = related_articles.related_pmid
INNER JOIN braindisease_child AS bdc
    ON bdc.child = pubmed_mesh.term
GROUP BY gene_id, bdc.term;
  • List of children for each mesh term
-- (term, child) pairs over braindisease: each term is paired with itself
-- and with every term whose tree_num is the term's tree_num plus a
-- '.'-separated suffix.
CREATE TABLE IF NOT EXISTS braindisease_child (
    term  VARCHAR(256),
    child VARCHAR(256),
    PRIMARY KEY (term, child)
) AS
SELECT DISTINCT
    parent.term,
    sub.term AS child
FROM braindisease AS parent
INNER JOIN braindisease AS sub
    ON sub.tree_num = parent.tree_num
    OR sub.tree_num LIKE CONCAT(parent.tree_num, '.%');
  • N = total number of citations considered
    • Only those with Brain Disease MeSH terms
  • m = # of pmids with a particular MeSH term
-- m for phyper: distinct pmids in pubmed_mesh tagged with each term or any
-- of its children (children come from braindisease_child).
-- The count is aliased num_pmid so that queries reading
-- term_citations.num_pmid resolve; the unaliased expression would give the
-- view column the expression text as its name.
CREATE VIEW term_citations AS
SELECT
    bd.term,
    COUNT(DISTINCT pubmed_mesh.pmid) AS num_pmid
FROM pubmed_mesh
INNER JOIN braindisease_child AS bd
    ON pubmed_mesh.term = bd.child
GROUP BY bd.term;
  • Only those with GeneRIFs and Brain Disease MeSH terms
-- m restricted to GeneRIF articles: per term, the number of distinct
-- generif pmids that pubmed_mesh tags with a braindisease term at or below
-- that term in the MeSH tree.
CREATE TABLE generifterm_citations AS
SELECT
    ancestor.term,
    COUNT(DISTINCT generif.pmid) AS num_pmid
FROM generif
INNER JOIN pubmed_mesh
    ON pubmed_mesh.pmid = generif.pmid
INNER JOIN braindisease AS tagged
    ON tagged.term = pubmed_mesh.term
INNER JOIN braindisease AS ancestor
    ON tagged.tree_num = ancestor.tree_num
    OR tagged.tree_num LIKE CONCAT(ancestor.tree_num, '.%')
GROUP BY ancestor.term;
  • Generate table of values for phyper:
    • general version - do computation of n in R
-- phyper input, one row per (gene, term) pair in gene_term_citations:
-- x from gene_term_citations, m from term_citations, k from gene_citations.
-- n is left to be computed from m in R.
-- Terminated with ';' so the statement can run inside a script.
SELECT
    gene.gene_id,
    locus,
    tc.term,
    gtc.num_pmid AS x,
    tc.num_pmid  AS m,
    gc.num_pmid  AS k
FROM gene
INNER JOIN gene_term_citations AS gtc
    ON gtc.gene_id = gene.gene_id
INNER JOIN gene_citations AS gc
    ON gc.gene_id = gene.gene_id
INNER JOIN term_citations AS tc
    ON tc.term = gtc.term;
  • n = 15806221 - m, where N = 15806221 is the number of all PubMed articles with MeSH
  • n = 660538 - m, where N = 660538 is the number of all PubMed articles with Brain Disease MeSH
  • All PubMed articles with Generif citations
-- phyper input using generifterm_citations for m, with n computed in SQL.
--   x = gene_term_citations.num_pmid
--   m = generifterm_citations.num_pmid
--   n = 351 - m
--   k = gene_citations.num_pmid
-- NOTE(review): 351 is a hard-coded population size N; presumably the
-- number of GeneRIF articles considered when this was written -- confirm
-- and recompute before reuse.
-- Terminated with ';' so the statement can run inside a script.
SELECT
    gene.gene_id,
    locus,
    grc.term,
    gtc.num_pmid       AS x,
    grc.num_pmid       AS m,
    351 - grc.num_pmid AS n,
    gc.num_pmid        AS k
FROM gene
INNER JOIN gene_term_citations AS gtc
    ON gtc.gene_id = gene.gene_id
INNER JOIN gene_citations AS gc
    ON gc.gene_id = gene.gene_id
INNER JOIN generifterm_citations AS grc
    ON grc.term = gtc.term;
  • Only those with GeneRIFs and Brain Disease MeSH terms
-- Per term: count of distinct generif pmids whose pubmed_mesh tag is a
-- braindisease term with the same tree_num as the term, or a tree_num that
-- extends it with a '.'-separated suffix.
CREATE TABLE generifterm_citations AS
SELECT
    top_term.term,
    COUNT(DISTINCT generif.pmid) AS num_pmid
FROM generif
INNER JOIN pubmed_mesh
    ON generif.pmid = pubmed_mesh.pmid
INNER JOIN braindisease AS hit
    ON pubmed_mesh.term = hit.term
INNER JOIN braindisease AS top_term
    ON hit.tree_num = top_term.tree_num
    OR hit.tree_num LIKE CONCAT(top_term.tree_num, '.%')
GROUP BY top_term.term;
  • k = # of unique pmids among GeneRIFs for a gene
-- k variant: per gene_id, the number of distinct generif pmids that also
-- appear in pubmed_mesh (generif pmids with no pubmed_mesh row are not
-- counted).
CREATE TABLE gene_citations AS
SELECT
    gene_id,
    COUNT(DISTINCT generif.pmid) AS num_pmid
FROM generif
INNER JOIN pubmed_mesh
    ON pubmed_mesh.pmid = generif.pmid
GROUP BY gene_id;
  • All PubMed articles with Generif citations
-- phyper input with m taken from generifterm_citations and n = 351 - m
-- computed in SQL; x comes from gene_term_citations and k from
-- gene_citations.
-- NOTE(review): 351 is a hard-coded population size N; presumably the
-- number of GeneRIF articles considered when this was written -- confirm.
-- Terminated with ';' so the statement can run inside a script.
SELECT
    gene.gene_id,
    locus,
    grc.term,
    gtc.num_pmid       AS x,
    grc.num_pmid       AS m,
    351 - grc.num_pmid AS n,
    gc.num_pmid        AS k
FROM gene
INNER JOIN gene_term_citations AS gtc
    ON gtc.gene_id = gene.gene_id
INNER JOIN gene_citations AS gc
    ON gc.gene_id = gene.gene_id
INNER JOIN generifterm_citations AS grc
    ON grc.term = gtc.term;

R Statistics

# Hypergeometric tail probability per row of gene_term_pubstats.txt, using
# columns 4-7 as phyper's q, m, n and k. The result is appended as column 8
# ("p-hyper") and a copy sorted by it is kept in pubterm_stats.sorted.
# NOTE(review): with lower.tail = FALSE phyper returns P[X > q], which
# excludes the observed count -- confirm that is intended.
pubterm_stats <- read.delim("gene_term_pubstats.txt")
pubterm_stats <- cbind(
  pubterm_stats,
  phyper(pubterm_stats[, 4], pubterm_stats[, 5],
         pubterm_stats[, 6], pubterm_stats[, 7],
         lower.tail = FALSE)
)
colnames(pubterm_stats)[8] <- "p-hyper"
pubterm_stats.sorted <- pubterm_stats[order(pubterm_stats[, 8]), ]
# Hypergeometric tail probability per row of gene_term_stats.txt, using
# columns 4-7 as phyper's q, m, n and k. The result is appended as column 8
# ("p-hyper") and a copy sorted by it is kept in term_stats.sorted.
# NOTE(review): with lower.tail = FALSE phyper returns P[X > q], which
# excludes the observed count -- confirm that is intended.
term_stats <- read.delim("gene_term_stats.txt")
term_stats <- cbind(
  term_stats,
  phyper(term_stats[, 4], term_stats[, 5],
         term_stats[, 6], term_stats[, 7],
         lower.tail = FALSE)
)
colnames(term_stats)[8] <- "p-hyper"
term_stats.sorted <- term_stats[order(term_stats[, 8]), ]
# Hypergeometric tail probability per row of gene_generifterm_stats.txt,
# using columns 4-7 as phyper's q, m, n and k. The result is appended as
# column 8 ("p-hyper") and a copy sorted by it is kept in
# RIFterm_stats.sorted.
# Fix: the read.delim() call was missing its closing parenthesis, which
# made the snippet a syntax error.
# NOTE(review): with lower.tail = FALSE phyper returns P[X > q], which
# excludes the observed count -- confirm that is intended.
RIFterm_stats <- read.delim("gene_generifterm_stats.txt")
RIFterm_stats <- cbind(
  RIFterm_stats,
  phyper(RIFterm_stats[, 4], RIFterm_stats[, 5],
         RIFterm_stats[, 6], RIFterm_stats[, 7],
         lower.tail = FALSE)
)
colnames(RIFterm_stats)[8] <- "p-hyper"
RIFterm_stats.sorted <- RIFterm_stats[order(RIFterm_stats[, 8]), ]
# Load the tab-delimited (gene, term) counts. Columns 4, 5 and 6 are passed
# to phyper as x, m and k; n is computed here as N - m.
geneterm_stats <- read.delim("gene-term.txt")

# phyper(q, m, n, k, lower.tail = FALSE) returns P[X > q]. Passing x - 1
# makes the result P[X >= x], so the observed count is part of the tail.
# Significance: population size N = 15806221.
geneterm_stats <- cbind(
  geneterm_stats,
  phyper(geneterm_stats[, 4] - 1,
         geneterm_stats[, 5],
         15806221 - geneterm_stats[, 5],
         geneterm_stats[, 6],
         lower.tail = FALSE)
)
colnames(geneterm_stats)[7] <- "Significance"

# Relevance: same test with population size N = 660538.
geneterm_stats <- cbind(
  geneterm_stats,
  phyper(geneterm_stats[, 4] - 1,
         geneterm_stats[, 5],
         660538 - geneterm_stats[, 5],
         geneterm_stats[, 6],
         lower.tail = FALSE)
)
colnames(geneterm_stats)[8] <- "Relevance"

# Rows ordered by ascending Significance (column 7).
geneterm_stats.sorted <- geneterm_stats[order(geneterm_stats[, 7]), ]
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-Share Alike 2.5 License.