BibTeX format
% Conference paper on solving sparse triangular systems on GPUs.
% The citation key is kept exactly as exported so existing citations still resolve.
% NOTE(review): booktitle was absent from the export (it is a required field for
% inproceedings entries); the value below is inferred from the DOI prefix
% "HiPC.2016" -- confirm it against the publisher's record before relying on it.
% Given names are only available as initials in the export; expand them to full
% names if the authoritative record provides them.
@inproceedings{Picciau:2017:10.1109/HiPC.2016.030,
  author    = {Picciau, A. and Inggs, G. and Wickerson, J. and Kerrigan, E. and Constantinides, G. A.},
  title     = {Balancing locality and concurrency: solving sparse triangular systems on {GPUs}},
  booktitle = {2016 {IEEE} 23rd International Conference on High Performance Computing ({HiPC})},
  pages     = {183--192},
  publisher = {IEEE},
  year      = {2017},
  doi       = {10.1109/HiPC.2016.030},
  url       = {https://doi.org/10.1109/HiPC.2016.030},
}
RIS format (EndNote, RefMan)
TY - CPAPER
AB - Many numerical optimisation problems rely on fast algorithms for solving sparse triangular systems of linear equations (STLs). To accelerate the solution of such equations, two types of approaches have been used: on GPUs, concurrency has been prioritised to the disadvantage of data locality, while on multi-core CPUs, data locality has been prioritised to the disadvantage of concurrency. In this paper, we discuss the interaction between data locality and concurrency in the solution of STLs on GPUs, and we present a new algorithm that balances both. We demonstrate empirically that, subject to there being enough concurrency available in the input matrix, our algorithm outperforms Nvidia’s concurrency-prioritising CUSPARSE algorithm for GPUs. Experimental results show a maximum speedup of 5.8-fold. Our solution algorithm, which we have implemented in OpenCL, requires a pre-processing phase that partitions the graph associated with the input matrix into sub-graphs, whose data can be stored in low-latency local memories. This preliminary analysis phase is expensive, but because it depends only on the input matrix, its cost can be amortised when solving for many different right-hand sides.
AU - Picciau,A
AU - Inggs,G
AU - Wickerson,J
AU - Kerrigan,E
AU - Constantinides,GA
DO - 10.1109/HiPC.2016.030
EP - 192
PB - IEEE
PY - 2017///
SP - 183
TI - Balancing locality and concurrency: solving sparse triangular systems on GPUs
UR - http://dx.doi.org/10.1109/HiPC.2016.030
UR - http://hdl.handle.net/10044/1/40611
ER -