libstdc++
balanced_quicksort.h
1 // -*- C++ -*-
2 
3 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 3, or (at your option) any later
9 // version.
10 
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /** @file parallel/balanced_quicksort.h
26  * @brief Implementation of a dynamically load-balanced parallel quicksort.
27  *
28  * It works in-place and needs only logarithmic extra memory.
29  * The algorithm is similar to the one proposed in
30  *
31  * P. Tsigas and Y. Zhang.
32  * A simple, fast parallel implementation of quicksort and
33  * its performance evaluation on SUN enterprise 10000.
34  * In 11th Euromicro Conference on Parallel, Distributed and
35  * Network-Based Processing, page 372, 2003.
36  *
37  * This file is a GNU parallel extension to the Standard C++ Library.
38  */
39 
40 // Written by Johannes Singler.
41 
42 #ifndef _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H
43 #define _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H 1
44 
45 #include <parallel/basic_iterator.h>
46 #include <bits/stl_algo.h>
47 
48 #include <parallel/settings.h>
49 #include <parallel/partition.h>
50 #include <parallel/random_number.h>
51 #include <parallel/queue.h>
52 #include <functional>
53 
54 #if _GLIBCXX_ASSERTIONS
55 #include <parallel/checkers.h>
56 #endif
57 
58 namespace __gnu_parallel
59 {
60 /** @brief Information local to one thread in the parallel quicksort run. */
61 template<typename RandomAccessIterator>
62  struct QSBThreadLocal
63  {
64  typedef std::iterator_traits<RandomAccessIterator> traits_type;
65  typedef typename traits_type::difference_type difference_type;
66 
67  /** @brief Continuous part of the sequence, described by an
68  iterator pair. */
69  typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece;
70 
71  /** @brief Initial piece to work on. */
72  Piece initial;
73 
74  /** @brief Work-stealing queue. */
75  RestrictedBoundedConcurrentQueue<Piece> leftover_parts;
76 
77  /** @brief Number of threads involved in this algorithm. */
78  thread_index_t num_threads;
79 
80  /** @brief Pointer to a counter of elements left over to sort. */
81  volatile difference_type* elements_leftover;
82 
83  /** @brief The complete sequence to sort. */
84  Piece global;
85 
86  /** @brief Constructor.
87  * @param queue_size Size of the work-stealing queue. */
88  QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
89  };
90 
91 /** @brief Balanced quicksort divide step.
92  * @param begin Begin iterator of subsequence.
93  * @param end End iterator of subsequence.
94  * @param comp Comparator.
95  * @param num_threads Number of threads that are allowed to work on
96  * this part.
97  * @pre @c (end-begin)>=1 */
98 template<typename RandomAccessIterator, typename Comparator>
99  typename std::iterator_traits<RandomAccessIterator>::difference_type
100  qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
101  Comparator comp, thread_index_t num_threads)
102  {
103  _GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
104 
105  typedef std::iterator_traits<RandomAccessIterator> traits_type;
106  typedef typename traits_type::value_type value_type;
107  typedef typename traits_type::difference_type difference_type;
108 
109  RandomAccessIterator pivot_pos =
110  median_of_three_iterators(begin, begin + (end - begin) / 2,
111  end - 1, comp);
112 
113 #if defined(_GLIBCXX_ASSERTIONS)
114  // Must be in between somewhere.
115  difference_type n = end - begin;
116 
117  _GLIBCXX_PARALLEL_ASSERT(
118  (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
119  || (!comp(*pivot_pos, *begin) && !comp(*(end - 1), *pivot_pos))
120  || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
121  || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*(end - 1), *pivot_pos))
122  || (!comp(*pivot_pos, *(end - 1)) && !comp(*begin, *pivot_pos))
123  || (!comp(*pivot_pos, *(end - 1)) && !comp(*(begin + n / 2), *pivot_pos)));
124 #endif
125 
126  // Swap pivot value to end.
127  if (pivot_pos != (end - 1))
128  std::swap(*pivot_pos, *(end - 1));
129  pivot_pos = end - 1;
130 
131  __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
132  pred(comp, *pivot_pos);
133 
134  // Divide, returning end - begin - 1 in the worst case.
135  difference_type split_pos = parallel_partition(
136  begin, end - 1, pred, num_threads);
137 
138  // Swap back pivot to middle.
139  std::swap(*(begin + split_pos), *pivot_pos);
140  pivot_pos = begin + split_pos;
141 
142 #if _GLIBCXX_ASSERTIONS
143  RandomAccessIterator r;
144  for (r = begin; r != pivot_pos; ++r)
145  _GLIBCXX_PARALLEL_ASSERT(comp(*r, *pivot_pos));
146  for (; r != end; ++r)
147  _GLIBCXX_PARALLEL_ASSERT(!comp(*r, *pivot_pos));
148 #endif
149 
150  return split_pos;
151  }
152 
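A minimal standalone sketch of the split invariant that qsb_divide establishes (and that its assertion block checks), using sequential std::partition in place of parallel_partition; the helper name is made up for illustration only:

// Sketch only, not part of the header: "left side < pivot, right side >= pivot".
#include <algorithm>
#include <cassert>
#include <vector>

inline void qsb_divide_postcondition_demo()  // hypothetical helper
{
  std::vector<int> v = {5, 1, 9, 3, 7, 2, 8, 4, 6};
  std::vector<int>::iterator begin = v.begin(), end = v.end();

  // Move a pivot (here simply the middle element) to the back, partition the
  // remaining elements against it, then swap it into its final position.
  std::swap(*(begin + v.size() / 2), *(end - 1));
  const int pivot = *(end - 1);
  std::vector<int>::iterator split =
    std::partition(begin, end - 1, [pivot](int x) { return x < pivot; });
  std::swap(*split, *(end - 1));

  // Same invariant as the _GLIBCXX_ASSERTIONS block in qsb_divide above.
  for (std::vector<int>::iterator r = begin; r != split; ++r)
    assert(*r < *split);
  for (std::vector<int>::iterator r = split; r != end; ++r)
    assert(!(*r < *split));
}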
153 /** @brief Quicksort conquer step.
154  * @param tls Array of thread-local storages.
155  * @param begin Begin iterator of subsequence.
156  * @param end End iterator of subsequence.
157  * @param comp Comparator.
158  * @param iam Number of the thread processing this function.
159  * @param num_threads
160  * Number of threads that are allowed to work on this part. */
161 template<typename RandomAccessIterator, typename Comparator>
162  void
163  qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls,
164  RandomAccessIterator begin, RandomAccessIterator end,
165  Comparator comp,
166  thread_index_t iam, thread_index_t num_threads,
167  bool parent_wait)
168  {
169  typedef std::iterator_traits<RandomAccessIterator> traits_type;
170  typedef typename traits_type::value_type value_type;
171  typedef typename traits_type::difference_type difference_type;
172 
173  difference_type n = end - begin;
174 
175  if (num_threads <= 1 || n <= 1)
176  {
177  tls[iam]->initial.first = begin;
178  tls[iam]->initial.second = end;
179 
180  qsb_local_sort_with_helping(tls, comp, iam, parent_wait);
181 
182  return;
183  }
184 
185  // Divide step.
186  difference_type split_pos = qsb_divide(begin, end, comp, num_threads);
187 
188 #if _GLIBCXX_ASSERTIONS
189  _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
190 #endif
191 
192  thread_index_t num_threads_leftside =
193  std::max<thread_index_t>(1, std::min<thread_index_t>(
194  num_threads - 1, split_pos * num_threads / n));
195 
196 # pragma omp atomic
197  *tls[iam]->elements_leftover -= (difference_type)1;
198 
199  // Conquer step.
200 # pragma omp parallel num_threads(2)
201  {
202  bool wait;
203  if(omp_get_num_threads() < 2)
204  wait = false;
205  else
206  wait = parent_wait;
207 
208 # pragma omp sections
209  {
210 # pragma omp section
211  {
212  qsb_conquer(tls, begin, begin + split_pos, comp,
213  iam,
214  num_threads_leftside,
215  wait);
216  wait = parent_wait;
217  }
218  // The pivot_pos is left in place, to ensure termination.
219 # pragma omp section
220  {
221  qsb_conquer(tls, begin + split_pos + 1, end, comp,
222  iam + num_threads_leftside,
223  num_threads - num_threads_leftside,
224  wait);
225  wait = parent_wait;
226  }
227  }
228  }
229  }
230 
231 /**
232  * @brief Quicksort step doing load-balanced local sort.
233  * @param tls Array of thread-local storages.
234  * @param comp Comparator.
235  * @param iam Number of the thread processing this function.
236  */
237 template<typename RandomAccessIterator, typename Comparator>
238  void
239  qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls,
240  Comparator& comp, int iam, bool wait)
241  {
242  typedef std::iterator_traits<RandomAccessIterator> traits_type;
243  typedef typename traits_type::value_type value_type;
244  typedef typename traits_type::difference_type difference_type;
245  typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece;
246 
247  QSBThreadLocal<RandomAccessIterator>& tl = *tls[iam];
248 
249  difference_type base_case_n =
250  _Settings::get().sort_qsb_base_case_maximal_n;
251  if (base_case_n < 2)
252  base_case_n = 2;
253  thread_index_t num_threads = tl.num_threads;
254 
255  // Every thread has its own random number generator.
256  random_number rng(iam + 1);
257 
258  Piece current = tl.initial;
259 
260  difference_type elements_done = 0;
261 #if _GLIBCXX_ASSERTIONS
262  difference_type total_elements_done = 0;
263 #endif
264 
265  for (;;)
266  {
267  // Invariant: current must be a valid (maybe empty) range.
268  RandomAccessIterator begin = current.first, end = current.second;
269  difference_type n = end - begin;
270 
271  if (n > base_case_n)
272  {
273  // Divide.
274  RandomAccessIterator pivot_pos = begin + rng(n);
275 
276  // Swap pivot_pos value to end.
277  if (pivot_pos != (end - 1))
278  std::swap(*pivot_pos, *(end - 1));
279  pivot_pos = end - 1;
280 
281  __gnu_parallel::binder2nd
282  <Comparator, value_type, value_type, bool>
283  pred(comp, *pivot_pos);
284 
285  // Divide, leave pivot unchanged in last place.
286  RandomAccessIterator split_pos1, split_pos2;
287  split_pos1 = __gnu_sequential::partition(begin, end - 1, pred);
288 
289  // Left side: < pivot_pos; right side: >= pivot_pos.
290 #if _GLIBCXX_ASSERTIONS
291  _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end);
292 #endif
293  // Swap pivot back to middle.
294  if (split_pos1 != pivot_pos)
295  std::swap(*split_pos1, *pivot_pos);
296  pivot_pos = split_pos1;
297 
298  // In case all elements are equal, split_pos1 == 0.
299  if ((split_pos1 + 1 - begin) < (n >> 7)
300  || (end - split_pos1) < (n >> 7))
301  {
302  // Very unequal split: one part is smaller than one 128th of n.
303  // Also find the elements not strictly larger than the pivot.
304  __gnu_parallel::unary_negate<__gnu_parallel::binder1st
305  <Comparator, value_type, value_type, bool>, value_type>
306  pred(__gnu_parallel::binder1st
307  <Comparator, value_type, value_type, bool>(comp,
308  *pivot_pos));
309 
310  // Find other end of pivot-equal range.
311  split_pos2 = __gnu_sequential::partition(split_pos1 + 1,
312  end, pred);
313  }
314  else
315  // Only skip the pivot.
316  split_pos2 = split_pos1 + 1;
317 
318  // Elements equal to pivot are done.
319  elements_done += (split_pos2 - split_pos1);
320 #if _GLIBCXX_ASSERTIONS
321  total_elements_done += (split_pos2 - split_pos1);
322 #endif
323  // Always push larger part onto stack.
324  if (((split_pos1 + 1) - begin) < (end - (split_pos2)))
325  {
326  // Right side larger.
327  if ((split_pos2) != end)
328  tl.leftover_parts.push_front(std::make_pair(split_pos2,
329  end));
330 
331  //current.first = begin; //already set anyway
332  current.second = split_pos1;
333  continue;
334  }
335  else
336  {
337  // Left side larger.
338  if (begin != split_pos1)
339  tl.leftover_parts.push_front(std::make_pair(begin,
340  split_pos1));
341 
342  current.first = split_pos2;
343  //current.second = end; //already set anyway
344  continue;
345  }
346  }
347  else
348  {
349  __gnu_sequential::sort(begin, end, comp);
350  elements_done += n;
351 #if _GLIBCXX_ASSERTIONS
352  total_elements_done += n;
353 #endif
354 
355  // Prefer own stack, small pieces.
356  if (tl.leftover_parts.pop_front(current))
357  continue;
358 
359 # pragma omp atomic
360  *tl.elements_leftover -= elements_done;
361 
362  elements_done = 0;
363 
364 #if _GLIBCXX_ASSERTIONS
365  double search_start = omp_get_wtime();
366 #endif
367 
368  // Look for new work.
369  bool successfully_stolen = false;
370  while (wait && *tl.elements_leftover > 0 && !successfully_stolen
371 #if _GLIBCXX_ASSERTIONS
372  // Possible deadlock.
373  && (omp_get_wtime() < (search_start + 1.0))
374 #endif
375  )
376  {
377  thread_index_t victim;
378  victim = rng(num_threads);
379 
380  // Large pieces.
381  successfully_stolen = (victim != iam)
382  && tls[victim]->leftover_parts.pop_back(current);
383  if (!successfully_stolen)
384  yield();
385 #if !defined(__ICC) && !defined(__ECC)
386 # pragma omp flush
387 #endif
388  }
389 
390 #if _GLIBCXX_ASSERTIONS
391  if (omp_get_wtime() >= (search_start + 1.0))
392  {
393  sleep(1);
394  _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime()
395  < (search_start + 1.0));
396  }
397 #endif
398  if (!successfully_stolen)
399  {
400 #if _GLIBCXX_ASSERTIONS
401  _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
402 #endif
403  return;
404  }
405  }
406  }
407  }
408 
409 /** @brief Top-level quicksort routine.
410  * @param begin Begin iterator of sequence.
411  * @param end End iterator of sequence.
412  * @param comp Comparator.
413  * @param num_threads Number of threads that are allowed to work on
414  * this part.
415  */
416 template<typename RandomAccessIterator, typename Comparator>
417  void
418  parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
419  Comparator comp,
420  thread_index_t num_threads)
421  {
422  _GLIBCXX_CALL(end - begin)
423 
424  typedef std::iterator_traits<RandomAccessIterator> traits_type;
425  typedef typename traits_type::value_type value_type;
426  typedef typename traits_type::difference_type difference_type;
427  typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece;
428 
429  typedef QSBThreadLocal<RandomAccessIterator> tls_type;
430 
431  difference_type n = end - begin;
432 
433  if (n <= 1)
434  return;
435 
436  // At least one element per processor.
437  if (num_threads > n)
438  num_threads = static_cast<thread_index_t>(n);
439 
440  // Initialize thread local storage
441  tls_type** tls = new tls_type*[num_threads];
442  difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1);
443  for (thread_index_t t = 0; t < num_threads; ++t)
444  tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
445 
446  // There can never be more than ceil(log2(n)) ranges on the stack, because
447  // 1. Only one processor pushes onto the stack
448  // 2. The largest range has at most length n
449  // 3. Each range is larger than half of the range remaining
450  volatile difference_type elements_leftover = n;
451  for (int i = 0; i < num_threads; ++i)
452  {
453  tls[i]->elements_leftover = &elements_leftover;
454  tls[i]->num_threads = num_threads;
455  tls[i]->global = std::make_pair(begin, end);
456 
457  // Just in case nothing is left to assign.
458  tls[i]->initial = std::make_pair(end, end);
459  }
460 
461  // Main recursion call.
462  qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true);
463 
464 #if _GLIBCXX_ASSERTIONS
465  // All stack must be empty.
466  Piece dummy;
467  for (int i = 1; i < num_threads; ++i)
468  _GLIBCXX_PARALLEL_ASSERT(!tls[i]->leftover_parts.pop_back(dummy));
469 #endif
470 
471  for (int i = 0; i < num_threads; ++i)
472  delete tls[i];
473  delete[] tls;
474  }
475 } // namespace __gnu_parallel
476 
477 #endif /* _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H */
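A minimal usage sketch, not part of libstdc++: it calls __gnu_parallel::parallel_sort_qsb with the signature declared above and assumes a GCC toolchain where the parallel-mode headers can be included directly and OpenMP is enabled (g++ -fopenmp). Later GCC releases uglified these identifiers (e.g. __parallel_sort_qsb), and the usual entry point in practice is std::sort compiled with -D_GLIBCXX_PARALLEL, which dispatches to a parallel sort according to the runtime _Settings.

// Usage sketch; compile with: g++ -fopenmp example.cc
#include <algorithm>
#include <functional>
#include <vector>
#include <omp.h>
#include <parallel/balanced_quicksort.h>

int main()
{
  std::vector<int> v(1 << 20);
  for (std::size_t i = 0; i < v.size(); ++i)
    v[i] = static_cast<int>((i * 2654435761u) % 100000);  // scrambled fill

  // Dynamically load-balanced quicksort, one OpenMP thread per core.
  __gnu_parallel::parallel_sort_qsb(v.begin(), v.end(),
                                    std::less<int>(),
                                    omp_get_max_threads());

  return std::is_sorted(v.begin(), v.end()) ? 0 : 1;
}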