hpcs-17-subord

Subordination: Providing Resilience to Simultaneous Failure of Multiple Cluster Nodes
git clone https://git.igankevich.com/hpcs-17-subord.git
Log | Files | Refs

refs.bib (3443B)


      1 @book{alexandrescu2001modern,
      2   title={Modern {C++} Design: Generic Programming and Design Patterns Applied},
      3   author={Alexandrescu, Andrei},
      4   year={2001},
      5   publisher={Addison-Wesley}
      6 }
      7 
      8 @article{stroustrup2012software,
      9   author  = {Stroustrup, Bjarne},
     10   title   = {Software development for infrastructure},
     11   journal = {IEEE Computer},
     12   year    = {2012},
     13   volume  = {45},
     14   number  = {1},
     15   pages   = {47--58}
     16 }
     17 
     18 @inproceedings{zuckerman2011using,
     19   author       = {Zuckerman, St{\'e}phane and Suetterlein, Joshua and
     20                   Knauerhase, Rob and Gao, Guang R},
     21   title        = {Using a codelet program execution model for exascale
     22                   machines: position paper},
     23   booktitle    = {Proceedings of the 1st International Workshop on Adaptive
     24                   Self-Tuning Computing Systems for the Exaflop Era},
     25   organization = {ACM},
     26   year         = {2011},
     27   pages        = {64--69}
     28 }
     29 
     30 @article{meneses2015using,
     31   title={Using migratable objects to enhance fault tolerance schemes in
     32          supercomputers},
     33   author={Meneses, Esteban and Ni, Xiang and Zheng, Gengbin and Mendes, Celso L
     34           and Kale, Laxmikant V},
     35   journal={IEEE Transactions on Parallel and Distributed Systems},
     36   volume={26},
     37   number={7},
     38   pages={2061--2074},
     39   year={2015},
     40   publisher={IEEE}
     41 }
     42 
     43 @inproceedings{gankevich2015subordination,
     44   author       = {Gankevich, Ivan and Tipikin, Yuri and Gaiduchok, Vladimir},
     45   title        = {Subordination: Cluster management without distributed consensus},
     46   booktitle    = {High Performance Computing \& Simulation (HPCS), 2015
     47                   International Conference on},
     48   organization = {IEEE},
     49   year         = {2015},
     50   pages        = {639--642}
     51 }
     52 
     53 
     54 @inproceedings{gankevich2016factory,
     55   author       = {Gankevich, Ivan and Tipikin, Yuri and Korkhov, Vladimir and
     56                   Gaiduchok, Vladimir},
     57   title        = {Factory: Non-stop batch jobs without checkpointing},
     58   booktitle    = {High Performance Computing \& Simulation (HPCS), 2016
     59                   International Conference on},
     60   organization = {IEEE},
     61   year         = {2016},
     62   pages        = {979--984}
     63 }
     64 
     65 
     66 @article{schroeder2007understanding,
     67   title={Understanding failures in petascale computers},
     68   author={Schroeder, Bianca and Gibson, Garth A},
     69   journal={Journal of Physics: Conference Series},
     70   volume={78},
     71   number={1},
     72   pages={012022},
     73   year={2007},
     74   publisher={IOP Publishing}
     75 }
     76 
     77 @inproceedings{robertson2000linux,
     78   title={{Linux-HA} Heartbeat System Design},
     79   author={Robertson, Alan},
     80   booktitle={Proceedings of the 4th Annual Linux Showcase \&
     81              Conference},
     82   year={2000},
     83   pages={305--316},
     84   address={Atlanta, Georgia},
     85   organization={USENIX},
     86   url={http://static.usenix.org/publications/library/proceedings/als00/2000papers/papers/full_papers/robertson/robertson_html/}
     87 }
     88 
     89 @article{haddad2003ha,
     90   author    = {Haddad, Ibrahim and Leangsuksun, Chokchai and Scott, Stephen L},
     91   title     = {{HA-OSCAR}: the birth of highly available {OSCAR}},
     92   journal   = {Linux Journal},
     93   publisher = {Belltown Media},
     94   year      = {2003},
     95   volume    = {2003},
     96   number    = {115},
     97   pages     = {1}
     98 }
     99 
    100 @article{leangsuksun2005achieving,
    101   title={Achieving high availability and performance computing with an
    102          {HA-OSCAR} cluster},
    103   author={Leangsuksun, Chokchai Box and Shen, Lixin and Liu, Tong and Scott,
    104           Stephen L},
    105   journal={Future Generation Computer Systems},
    106   volume={21},
    107   number={4},
    108   pages={597--606},
    109   year={2005},
    110   publisher={Elsevier}
    111 }
    112 
    113 @misc{factoryGithub,
    114   author       = {Gankevich, Ivan and Tipikin, Yuri},
    115   title        = {Factory: A framework for distributed computing},
    116   howpublished = {\url{https://igankevich.github.io/factory/index.html}}
    117 }