FFPopSim  2.0
Library for Population Genetics
 All Classes Files Functions Variables Macros Pages
ffpopsim_highd.h
Go to the documentation of this file.
1 
31 #ifndef FFPOPSIM_HIGHD_H_
32 #define FFPOPSIM_HIGHD_H_
33 #include "ffpopsim_generic.h"
34 
35 #define HCF_MEMERR -131545
36 #define HCF_BADARG -131546
37 #define HCF_VERBOSE 0
38 #define WORDLENGTH 28 //length used to chop bitsets into words
39 
40 using namespace std;
41 
42 
51 struct coeff_t {
52  int order;
53  double value;
54  int *loci;
55  coeff_t(double value_in, vector <int> loci_in){
56  value=value_in;
57  order=loci_in.size();
58  loci=new int [order];
59  for (int i=0; i<order; i++) loci[i]=loci_in[i];
60  }
61 };
62 
70  double value;
71  int locus;
72  coeff_single_locus_t(double value_in, int locus_in) : value(value_in), locus(locus_in) {};
73 };
74 
90 private:
91  // random number generator
92  gsl_rng *rng;
93  unsigned int seed;
94 
95  // memory management
96  bool hcube_allocated;
97  bool mem;
98  int allocate_mem();
99  int free_mem();
100 
101  // iterators
102  vector<coeff_single_locus_t>::iterator coefficients_single_locus_iter;
103  vector<coeff_t>::iterator coefficients_epistasis_iter;
104 
105  // static array of single locus coefficients (for performance reasons)
106  vector<double> coefficients_single_locus_static;
107 
108 public:
109  // random number generator
111 
112  // attributes
113  int dim;
116  vector <coeff_single_locus_t> coefficients_single_locus;
117  vector <coeff_t> coefficients_epistasis;
118 
119  // setting up
120  hypercube_highd();
121  hypercube_highd(int dim_in, int s=0);
122  virtual ~hypercube_highd();
123  int set_up(int dim_in, int s=0);
124 
125  // get methods
126  unsigned int get_dim(){return dim;}
127  unsigned int get_seed() {return seed;};
128  double get_func(boost::dynamic_bitset<>& genotype);
129  double get_additive_coefficient(int locus);
130  double get_func_diff(boost::dynamic_bitset<>& genotype1, boost::dynamic_bitset<>& genotype2, vector<int> &diffpos);
131 
132  // change the hypercube
133  void reset();
134  void reset_additive();
135  int set_additive_coefficient(double value, int locus, int expected_locus=-1);
136  int add_coefficient(double value, vector <int> loci);
137  int set_random_epistasis_strength(double sigma);
138 };
139 
140 
141 // Control constants
142 #define HP_VERBOSE 0
143 #define NO_GENOTYPE -1
144 #define HP_MINAF 0.02
145 #define MAX_DELTAFITNESS 8
146 #define MAX_POPSIZE 500000
147 #define HP_NOTHING 1e-12
148 #define HP_RANDOM_SAMPLE_FRAC 0.01
149 #define HP_VERY_NEGATIVE -1e15
150 
151 // Error Codes
152 #define HP_BADARG -879564
153 #define HP_MEMERR -986465
154 #define HP_EXPLOSIONWARN 4
155 #define HP_EXTINCTERR 5
156 #define HP_NOBINSERR 6
157 #define HP_WRONGBINSERR 7
158 #define HP_RUNTIMEERR 8
159 
165 struct clone_t {
166  boost::dynamic_bitset<> genotype;
167  vector<double> trait;
168  double fitness;
170  clone_t(int n_traits=0) : genotype(boost::dynamic_bitset<>(0)), trait(n_traits, 0), fitness(0), clone_size(0) {};
171 
172  // Comparison operators check fitness first, genome (big endian) last
173  bool operator==(const clone_t &other) const {return (fitness == other.fitness) && (genotype == other.genotype);}
174  bool operator!=(const clone_t &other) const {return (fitness != other.fitness) || (genotype != other.genotype);}
175  bool operator<(const clone_t &other) const {
176  if(fitness < other.fitness) return true;
177  else if (fitness > other.fitness) return false;
178  else {
179  for(size_t i=0; i < genotype.size(); i++) {
180  if((!genotype[i]) && (other.genotype[i])) return true;
181  else if((genotype[i]) && (!other.genotype[i])) return false;
182  }
183  return false;
184  }
185  }
186  bool operator>(const clone_t &other) const {
187  if(fitness > other.fitness) return true;
188  else if (fitness < other.fitness) return false;
189  else {
190  for(size_t i=0; i < genotype.size(); i++) {
191  if((genotype[i]) && (!other.genotype[i])) return true;
192  else if((!genotype[i]) && (other.genotype[i])) return false;
193  }
194  return false;
195  }
196  }
197 };
198 
199 
200 /*
201  * @brief a class that implements a rooted tree to store genealogies
202  *
203  * Nodes and edges are stored as maps with a key that holds the age (rather the time) when the node lived
204  * and the index in the population at that time. The nodes themselves are sufficient to reconstruct the tree
205  * since they contain keys of parents and children
206  */
207 
208 #ifndef rooted_tree_H_
209 #define rooted_tree_H_
210 #define RT_VERBOSE 0
211 #define RT_VERYLARGE 10000000
212 #define RT_CHILDNOTFOUND -35343
213 #define RT_NODENOTFOUND -35765
214 
215 #include <map>
216 #include <set>
217 #include <vector>
218 #include <iostream>
219 #include <string>
220 #include <sstream>
221 #include <list>
222 #include <gsl/gsl_histogram.h>
223 
224 using namespace std;
225 
226 struct tree_key_t {
227  int index;
228  int age;
229  bool operator==(const tree_key_t &other) {return (age == other.age) && (index == other.index);}
230  bool operator!=(const tree_key_t &other) {return (age != other.age) || (index != other.index);}
231  bool operator<(const tree_key_t &other) const {
232  if(age < other.age) return true;
233  else if (age > other.age) return false;
234  else { return (index<other.index); }
235  }
236  bool operator>(const tree_key_t &other) const {
237  if(age > other.age) return true;
238  else if (age < other.age) return false;
239  else { return (index>other.index); }
240  }
241  tree_key_t(int index=0, int age=0) : index(index), age(age) {};
242 };
243 
244 struct step_t {
245  int pos;
246  int step;
247  bool operator<(const step_t &other) const {
248  if(pos < other.pos) return true;
249  else return false;
250  }
251  bool operator>(const step_t &other) const {
252  if(pos > other.pos) return true;
253  else return false;
254  }
255  bool operator==(const step_t &other) const {
256  if(pos == other.pos) return true;
257  else return false;
258  }
259  step_t(int pos=0, int step=0) : pos(pos), step(step) {};
260 };
261 
262 struct node_t {
264  list < tree_key_t > child_edges;
265  double fitness;
267  vector <step_t> weight_distribution;
270  int crossover[2];
271 };
272 
273 struct edge_t {
276  int segment[2];
277  int length;
279 };
280 
281 struct poly_t {
282  int birth;
284  double effect;
285  double fitness;
287  poly_t(int b=0, int age=0, double e=0, double f=0, double fvar=0) :
288  birth(b), sweep_time(age), effect(e), fitness(f), fitness_variance(fvar) {};
289 };
290 
291 
292 class rooted_tree {
293 public:
294  map < tree_key_t , edge_t > edges;
295  map < tree_key_t , node_t > nodes;
296  vector <tree_key_t> leafs;
299 
300  rooted_tree();
301  virtual ~rooted_tree();
302  void reset();
303  void add_generation(vector <node_t> &new_generation, double mean_fitness);
304  int add_terminal_node(node_t &newNode);
305  tree_key_t erase_edge_node(tree_key_t to_be_erased);
306  tree_key_t bridge_edge_node(tree_key_t to_be_bridged);
307  int external_branch_length();
308  int total_branch_length();
309  int ancestors_at_age(int age, tree_key_t subtree_root, vector <tree_key_t> &ancestors);
310  int update_leaf_to_root(tree_key_t leaf);
311  void update_tree();
312  int calc_weight_distribution(tree_key_t subtree_root);
313  void SFS(gsl_histogram *sfs);
314  tree_key_t get_MRCA(){return MRCA;};
315  int erase_child(map <tree_key_t,node_t>::iterator Pnode, tree_key_t to_be_erased);
316  int delete_extra_children(tree_key_t subtree_root);
317  int delete_one_child_nodes(tree_key_t subtree_root);
318  bool check_node(tree_key_t node);
319  int check_tree_integrity();
320  void clear_tree();
321 
322  // print tree or subtrees
323  string print_newick();
324  string subtree_newick(tree_key_t root);
325  string print_weight_distribution(tree_key_t node_key);
326 
327  // construct subtrees
328  int construct_subtree(vector <tree_key_t> subtree_leafs, rooted_tree &other);
329 
330 
331 };
332 
333 #endif /* rooted_tree_H_ */
334 
335 /*
336  * @brief short wrapper class that handles trees at different places in the genome
337  *
338  * the class contains a vector of rooted_tree instances that hold the genealogy
339  * in different places. In addition, there is a rooted_tree called subtree
340  * that is used on demand
341  *
342  * Created on: Oct 14, 2012
343  * Author: richard
344  */
345 
346 #ifndef MULTILOCUSGENEALOGY_H_
347 #define MULTILOCUSGENEALOGY_H_
349 public:
350  vector <int> loci; //vector of loci (positions on a genome) whose genealogy is to be tracked
351  vector <rooted_tree> trees; //vector of rooted trees (one per locus)
352  vector < vector < node_t > > newGenerations; //used by the evolving class to store the new generation
353 
355  virtual ~multi_locus_genealogy();
356  void track_locus(int new_locus);
357  void reset(){loci.clear(); trees.clear();newGenerations.clear();}
358  void reset_but_loci(){for(unsigned int i=0; i<loci.size(); i++){trees[i].reset();newGenerations[i].clear();}}
359  void add_generation(double baseline);
360  int extend_storage(int n);
361 };
362 #endif /* MULTILOCUSGENEALOGY_H_ */
363 
364 
379 public:
380  // genotype to traits maps, which in turn are used in the trait-to-fitness map
382 
383  // construction / destruction
384  haploid_highd(int L=0, int rng_seed=0, int number_of_traits=1, bool all_polymorphic=false);
385  virtual ~haploid_highd();
386 
387  // the population
388  vector <clone_t> population;
389 
390  // population parameters (read/write)
391  int carrying_capacity; // carrying capacity of the environment (pop size)
392  double outcrossing_rate; // probability of having sex
393  double crossover_rate; // rate of crossover during sex
394  int recombination_model; //model of recombination to be used
395  bool circular; //topology of the chromosome
396  double growth_rate; //growth rate for bottlenecks and the like
397 
398  // mutation rate (only if not all_polymorphic)
399  double get_mutation_rate(){return mutation_rate;}
400  void set_mutation_rate(double m){
401  if(all_polymorphic){
402  if(HP_VERBOSE) cerr<<"Cannot set the mutation rate with all_polymorphic."<<endl;
403  throw HP_BADARG;
404  } else mutation_rate=m;}
405 
406  // pseudo-infinite site model
407  bool is_all_polymorphic(){return all_polymorphic;}
408  vector<poly_t> get_polymorphisms(){return polymorphism;}
409  vector<poly_t> get_fixed_mutations(){return fixed_mutations;}
410  vector<int> get_number_of_mutations(){return number_of_mutations;}
411 
412  // population parameters (read only)
413  int L(){return number_of_loci;}
414  int get_number_of_loci(){return number_of_loci;}
415  int N(){return population_size;}
416  int get_population_size() {return population_size;}
417  int get_generation(){return generation;}
418  void set_generation(int g){generation = g;}
419  int get_number_of_clones(){return number_of_clones;}
420  int get_number_of_traits(){return number_of_traits;}
421  double get_participation_ratio(){return participation_ratio;}
422 
423  // initialization
424  int set_allele_frequencies(double* frequencies, unsigned long N);
425  int set_genotypes(vector <genotype_value_pair_t> gt);
426  int set_wildtype(unsigned long N);
427  int track_locus_genealogy(vector <int> loci);
428 
429  // modify population
430  void add_genotype(boost::dynamic_bitset<> genotype, int n=1);
431 
432  // modify traits
433  int add_trait_coefficient(double value, vector <int> loci, int t=0){return trait[t].add_coefficient(value, loci);}
434  void clear_trait(int t=0){if(t >= number_of_traits) throw (int)HP_BADARG; else trait[t].reset();}
435  void clear_traits(){for(int t=0; t<number_of_traits; t++){trait[t].reset();}}
436  void set_random_trait_epistasis(double epistasis_std,int traitnumber=0){trait[traitnumber].epistatic_std=epistasis_std;}
437 
438  // modify fitness (shortcuts: they only make sense if number_of_traits=1)
439  int add_fitness_coefficient(double value, vector <int> loci){if(number_of_traits>1) throw (int)HP_BADARG; return add_trait_coefficient(value, loci, 0);}
440  void clear_fitness(){if(number_of_traits>1){if(HP_VERBOSE) cerr<<"What do you mean by fitness?"<<endl; throw (int)HP_BADARG;} clear_traits();}
441  void set_random_epistasis(double epistasis_std){if(number_of_traits>1){if(HP_VERBOSE) cerr<<"Please use set_random_trait_epistasis."<<endl; throw (int)HP_BADARG;} trait[0].epistatic_std=epistasis_std;}
442 
443  // evolution
444  int evolve(int gen=1);
445  int bottleneck(int size_of_bottleneck);
446  unsigned int flip_single_locus(int locus);
447 
448  // update traits and fitness and calculate statistics
449  void calc_stat();
450  void unique_clones();
451  vector <int> get_nonempty_clones();
452 
453  // readout
454  // Note: these functions are for the general public and are not expected to be
455  // extremely fast. If speed is a major concern, consider subclassing and working
456  // with protected methods.
457 
458  // random clones
459  int random_clone();
460  int random_clones(unsigned int n_o_individuals, vector <int> *sample);
461 
462  // genotype readout
463  string get_genotype_string(unsigned int i){string gts; boost::to_string(population[i].genotype, gts); return gts;}
464  int distance_Hamming(unsigned int clone1, unsigned int clone2, vector <unsigned int *> *chunks=NULL, unsigned int every=1){return distance_Hamming(population[clone1].genotype, population[clone2].genotype, chunks, every);}
465  int distance_Hamming(boost::dynamic_bitset<> gt1, boost::dynamic_bitset<> gt2, vector<unsigned int *> *chunks=NULL, unsigned int every=1);
466  stat_t get_diversity_statistics(unsigned int n_sample=1000);
467  stat_t get_divergence_statistics(unsigned int n_sample=1000);
468 
469  // allele frequencies
470  double get_allele_frequency(int l) {if (!allele_frequencies_up_to_date){calc_allele_freqs();} return allele_frequencies[l];}
471  double get_derived_allele_frequency(int l) {if (ancestral_state[l]) {return 1.0-get_allele_frequency(l);} else {return get_allele_frequency(l);}}
472 
473  double get_pair_frequency(int locus1, int locus2);
474  vector <double> get_pair_frequencies(vector < vector <int> > *loci);
475  double get_chi(int l) {return 2 * get_allele_frequency(l) - 1;}
476  double get_derived_chi(int l) {return 2 * get_derived_allele_frequency(l) - 1;}
477  double get_chi2(int locus1, int locus2){return get_moment(locus1, locus2)-get_chi(locus1)*get_chi(locus2);}
478  double get_LD(int locus1, int locus2){return 0.25 * get_chi2(locus1, locus2);}
479  double get_moment(int locus1, int locus2){return 4 * get_pair_frequency(locus1, locus2) + 1 - 2 * (get_allele_frequency(locus1) + get_allele_frequency(locus2));}
480 
481  // fitness/phenotype readout
482  void set_trait_weights(double *weights){for(int t=0; t<number_of_traits; t++) trait_weights[t] = weights[t];}
483  double get_trait_weight(int t){return trait_weights[t];}
484  double get_fitness(int n) {calc_individual_fitness(population[n]); return population[n].fitness;}
485  int get_clone_size(int n) {return population[n].clone_size;}
486  double get_trait(int n, int t=0) {calc_individual_traits(population[n]); return population[n].trait[t];}
487  vector<coeff_t> get_trait_epistasis(int t=0){return trait[t].coefficients_epistasis;}
488  stat_t get_fitness_statistics() {update_fitness(); calc_fitness_stat(); return fitness_stat;}
489  stat_t get_trait_statistics(int t=0) {calc_trait_stat(); return trait_stat[t];}
490  double get_trait_covariance(int t1, int t2) {calc_trait_stat(); return trait_covariance[t1][t2];}
491  double get_max_fitness() {return fitness_max;}
492  void update_traits();
493  void update_fitness();
494 
495  // histograms
496  int get_divergence_histogram(gsl_histogram **hist, unsigned int bins=10, vector <unsigned int *> *chunks=NULL, unsigned int every=1, unsigned int n_sample=1000);
497  int get_diversity_histogram(gsl_histogram **hist, unsigned int bins=10, vector <unsigned int *> *chunks=NULL, unsigned int every=1, unsigned int n_sample=1000);
498  int get_fitness_histogram(gsl_histogram **hist, unsigned int bins=10, unsigned int n_sample=1000);
499 
500  // stream I/O
501  int print_allele_frequencies(ostream &out);
502  int read_ms_sample(istream &gts, int skip_locus, int multiplicity);
503  int read_ms_sample_sparse(istream &gts, int skip_locus, int multiplicity, int distance);
504 
505  // genealogy
507 
508 protected:
509  // random number generator
510  gsl_rng* evo_generator;
511  gsl_rng* label_generator;
512  int seed;
513  int get_random_seed();
514  vector <int> random_sample;
515  void produce_random_sample(int size=1000);
516 
517  // population parameters
523  double mutation_rate; // rate of mutation per locus per generation
524 
525  // evolution
526  int mutate();
527  int select_gametes();
528  double relaxation_value();
529  double get_logmean_expfitness(); // Log of the population exp-average of the fitness: log[<exp(F)>_{population}]
530 
531  unsigned int flip_single_locus(unsigned int clonenum, int locus);
532  void shuffle_genotypes();
533  int new_generation();
534 
535  // clone structure
537  int partition_cumulative(vector <unsigned int> &partition_cum);
538  int provide_at_least(int n);
540 
541  // allele_frequencies
545  double *chi1; //symmetric allele frequencies
546  double **chi2; //symmetric two locus correlations
547  bool all_polymorphic; // switch that makes sure every locus is polymorphic in an infinite alleles model when mutation rate is 0
548  vector <int> ancestral_state; //vector, that for each locus keeps track of the ancestral state. by default, all zero
549  vector <poly_t> polymorphism; //vector, that keeps track when an allele was introduced on which background. Only needed in an infinite alleles model
550  vector <poly_t> fixed_mutations; //vector to store all fixed mutations
551  vector <int> number_of_mutations; //vector to store the number of mutations introduced each generation
552  void calc_allele_freqs();
553 
554  // recombination details
556  int *genome; //Auxiliary array holding the positions along the genome
558  void reassortment_pattern();
559  void crossover_pattern();
560  vector <int> sex_gametes; //array holding the indices of gametes
561  int add_recombinants();
562  int recombine(int parent1, int parent2);
563  int recombine_crossover(int parent1, int parent2, int ng);
564 
565  // fitness and traits
566  double fitness_max;
570  void calc_fitness_stat();
571  void calc_trait_stat();
572  void calc_individual_traits(clone_t &tempgt);
573  void calc_individual_fitness(clone_t &tempgt);
574  void calc_individual_traits(int clonenum){calc_individual_traits(population[clonenum]);}
575  void calc_individual_fitness(int clonenum){calc_individual_fitness(population[clonenum]);}
576  void check_individual_maximal_fitness(clone_t &tempgt){fitness_max = fmax(fitness_max, tempgt.fitness);}
577  double get_trait_difference(clone_t &tempgt1, clone_t &tempgt2, vector<int>& diffpos, int traitnum);
578 
579  // phenotype-fitness map. By default, a linear map with equal weights is set, but weights can be reset
580  double *trait_weights;
581  virtual void calc_individual_fitness_from_traits(clone_t &tempgt);
582  virtual void calc_individual_fitness_from_traits(int clonenum) {calc_individual_fitness_from_traits(population[clonenum]);}
583  void add_clone_to_genealogy(int locus, int dest, int parent, int left, int right, int cs, int n);
585 
586 private:
587  // Memory management is private, subclasses must take care only of their own memory
588  bool mem;
589  bool cumulants_mem;
590  int allocate_mem();
591  int free_mem();
592 
593  // These two vectors are used to recycle dead clones
594  vector <int> available_clones;
595  vector <int> clones_needed_for_recombination;
596 
597  boost::dynamic_bitset<> rec_pattern;
598 
599  // counting reference
600  static size_t number_of_instances;
601 };
602 
603 #endif /* FFPOPSIM_HIGHD_H_ */