6namespace ParticleBinning {
8 template <typename BunchType, typename BinningSelector>
9 void AdaptBins<BunchType, BinningSelector>::initLimits() {
10 Inform msg("AdaptBins"); // INFORM_ALL_NODES
12 // Update needed if bunch->create() is called between binnings!
13 var_selector_m.updateDataArr(bunch_m);
14 BinningSelector var_selector = var_selector_m;
15 size_type nlocal = bunch_m->getLocalNum();
17 // Start limit reduction if number of particles is big enough.
18 IpplTimings::startTimer(bInitLimitsT);
20 msg << "Particles in the bunch = " << nlocal << ". Overwriting limits manually." << endl;
21 xMin_m = xMax_m = (nlocal == 0) ? 0 : 0;
23 Kokkos::MinMaxScalar<value_type> localMinMax;
24 // Sadly this is necessary, since Kokkos seems to have a problem when nlocal == 1, where it does not update localMinMax
26 // Access and copy values explicitly
27 Kokkos::View<value_type, Kokkos::HostSpace> host_scalar("host_scalar");
28 Kokkos::View<value_type> tmp_dvalue("tmp_dvalue");
29 Kokkos::parallel_for("tmp_dvalue", 1, KOKKOS_LAMBDA(const size_type) { tmp_dvalue() = var_selector(0); });
30 Kokkos::deep_copy(host_scalar, tmp_dvalue);
31 localMinMax.max_val = localMinMax.min_val = host_scalar();
33 Kokkos::parallel_reduce("localBinLimitReduction", nlocal,
34 KOKKOS_LAMBDA(const size_type i, Kokkos::MinMaxScalar<value_type>& update) {
35 value_type val = var_selector(i);
36 update.min_val = Kokkos::min(update.min_val, val);
37 update.max_val = Kokkos::max(update.max_val, val);
38 }, Kokkos::MinMax<value_type>(localMinMax));
40 xMin_m = localMinMax.min_val;
41 xMax_m = localMinMax.max_val;
43 IpplTimings::stopTimer(bInitLimitsT);
45 IpplTimings::startTimer(bAllReduceLimitsT);
46 ippl::Comm->allreduce(xMax_m, 1, std::greater<value_type>());
47 ippl::Comm->allreduce(xMin_m, 1, std::less<value_type>());
48 IpplTimings::stopTimer(bAllReduceLimitsT);
50 // Update bin width variable with new limits
51 binWidth_m = (xMax_m - xMin_m) / currentBins_m;
52 msg << "Initialized limits. Min: " << xMin_m << ", max: " << xMax_m << ", binWidth: " << binWidth_m << endl;
56 template <typename BunchType, typename BinningSelector>
57 void AdaptBins<BunchType, BinningSelector>::instantiateHistogram(bool setToZero) {
58 // Reinitialize the histogram view with the new size (numBins)
59 const bin_index_type numBins = getCurrentBinCount();
60 localBinHisto_m = d_histo_type("localBinHisto_m", numBins, xMax_m - xMin_m,
61 binningAlpha_m, binningBeta_m, desiredWidth_m);
63 // Optionally, initialize the histogram to zero
65 dview_type device_histo = localBinHisto_m.template getDeviceView<dview_type>(localBinHisto_m.getHistogram());
66 Kokkos::deep_copy(device_histo, 0);
68 // Sync changed (only has an effect if DualView mode is used and the two memory spaces are different!)
69 localBinHisto_m.modify_device();
70 localBinHisto_m.sync();
75 template <typename BunchType, typename BinningSelector>
76 KOKKOS_INLINE_FUNCTION typename AdaptBins<BunchType, BinningSelector>::bin_index_type
77 AdaptBins<BunchType, BinningSelector>::getBin(value_type x, value_type xMin, value_type xMax, value_type binWidthInv, bin_index_type numBins) {
78 // Explanation: Don't access xMin, binWidth, ... through the members to avoid implicit
79 // variable capture by Kokkos and potential copying overhead. Instead, pass them as an
80 // argument, s.t. Kokkos can capture them explicitly!
81 // Make it static to avoid implicit capture of this inside Kokkos lambda!
83 // Ensure x is within bounds (clamp it between xMin and xMax --> this is only for bin assignment).
84 x += (x < xMin) * (xMin - x) + (x > xMax) * (xMax - x);
86 bin_index_type bin = (x - xMin) * binWidthInv;
87 return (bin >= numBins) ? (numBins - 1) : bin; // Clamp to the maximum bin if "out of bounds".
88 // Might happen if binning is called before initLimits()...
92 template <typename BunchType, typename BinningSelector>
93 void AdaptBins<BunchType, BinningSelector>::assignBinsToParticles() {
94 Inform msg("AdaptBins");
96 var_selector_m.updateDataArr(bunch_m);
97 BinningSelector var_selector = var_selector_m;
98 bin_view_type binIndex = getBinView();
100 IpplTimings::startTimer(bAssignUniformBinsT);
101 if (bunch_m->getLocalNum() <= 1) {
102 msg << "Too few bins, assigning all bins to index 0." << endl;
103 Kokkos::deep_copy(binIndex, 0);
107 // Declare the variables locally before the Kokkos::parallel_for (to avoid implicit this capture in Kokkos lambda)
108 value_type xMin = xMin_m, xMax = xMax_m, binWidthInv = 1.0/binWidth_m;
109 bin_index_type numBins = currentBins_m;
111 // Assign the bin index to the particle
112 Kokkos::parallel_for("assignParticleBinsConst", bunch_m->getLocalNum(), KOKKOS_LAMBDA(const size_type& i) {
113 value_type v = var_selector(i);
115 bin_index_type bin = getBin(v, xMin, xMax, binWidthInv, numBins);
118 IpplTimings::stopTimer(bAssignUniformBinsT);
120 msg << "All bins assigned." << endl;
124 template<typename BunchType, typename BinningSelector>
125 template<typename ReducerType>
126 void AdaptBins<BunchType, BinningSelector>::executeInitLocalHistoReduction(ReducerType& to_reduce) {
127 bin_view_type binIndex = getBinView();
128 dview_type device_histo = localBinHisto_m.template getDeviceView<dview_type>(localBinHisto_m.getHistogram());
129 bin_index_type binCount = getCurrentBinCount();
131 // Reduce using an array-reducer object (size needs to be known at compile time for GPU kernels)
132 Kokkos::parallel_reduce("initLocalHist", bunch_m->getLocalNum(),
133 KOKKOS_LAMBDA(const size_type& i, ReducerType& update) {
134 bin_index_type ndx = binIndex(i); // Determine the bin index for this particle
135 update.the_array[ndx]++; // Increment the corresponding bin count in the reduction array
136 }, Kokkos::Sum<ReducerType>(to_reduce)
139 // Copy the reduced results to the final histogram (into the BinHisto class instance)
140 Kokkos::parallel_for("finalize_histogram", binCount,
141 KOKKOS_LAMBDA(const bin_index_type& i) {
142 device_histo(i) = to_reduce.the_array[i];
146 // Note, sync is called inside initLocalHiso, not here
147 localBinHisto_m.modify_device();
151 template <typename BunchType, typename BinningSelector>
152 void AdaptBins<BunchType, BinningSelector>::executeInitLocalHistoReductionTeamFor() {
153 bin_view_type binIndex = getBinView();
154 dview_type device_histo = localBinHisto_m.template getDeviceView<dview_type>(localBinHisto_m.getHistogram());
155 const bin_index_type binCount = getCurrentBinCount();
156 const size_type localNumParticles = bunch_m->getLocalNum();
158 using team_policy = Kokkos::TeamPolicy<>;
159 using member_type = team_policy::member_type;
161 // Define a scratch space view type
162 using scratch_view_type = Kokkos::View<size_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
164 // Calculate shared memory size for the histogram (binCount elements)
165 const size_t shared_size = scratch_view_type::shmem_size(binCount);
167 const size_type team_size = 128;
168 const size_type block_size = team_size * 8;
169 const size_type num_leagues = (localNumParticles + block_size - 1) / block_size; // number of teams!
171 // Set up team policy with scratch memory allocation for each team
172 team_policy policy(num_leagues, team_size);
173 policy = policy.set_scratch_size(0, Kokkos::PerTeam(shared_size));
175 // Launch a team parallel_for with the scratch memory setup
176 Kokkos::parallel_for("initLocalHist", policy, KOKKOS_LAMBDA(const member_type& teamMember) {
177 // Allocate team-local histogram in scratch memory
178 scratch_view_type team_local_hist(teamMember.team_scratch(0), binCount);
180 // Initialize shared memory histogram to zero
181 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, binCount), [&](const bin_index_type b) {
182 team_local_hist(b) = 0;
184 teamMember.team_barrier();
186 const size_type start_i = teamMember.league_rank() * block_size;
187 const size_type end_i = Kokkos::min(start_i + block_size, localNumParticles);
189 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, start_i, end_i), [&](const size_type i) {
190 bin_index_type ndx = binIndex(i); // Get bin index for the particle
191 if (ndx < binCount) Kokkos::atomic_increment(&team_local_hist(ndx)); // Atomic within shared memory
193 teamMember.team_barrier();
195 // Reduce the team-local histogram into global memory
196 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, binCount), [&](const bin_index_type i) {
197 Kokkos::atomic_add(&device_histo(i), team_local_hist(i));
201 localBinHisto_m.modify_device();
205 template <typename BunchType, typename BinningSelector>
206 void AdaptBins<BunchType, BinningSelector>::initLocalHisto(HistoReductionMode modePreference) {
207 Inform msg("AdaptBins");
209 bin_index_type binCount = getCurrentBinCount();
211 // Determine the execution method based on the bin count and the mode...
212 HistoReductionMode mode = determineHistoReductionMode(modePreference, binCount);
214 IpplTimings::startTimer(bExecuteHistoReductionT);
215 if (mode == HistoReductionMode::HostOnly) {
216 msg << "Using host-only parallel_reduce reduction." << endl;
217 HostArrayReduction<size_type, bin_index_type>::binCountStatic = binCount;
218 HostArrayReduction<size_type, bin_index_type> reducer_arr;
219 executeInitLocalHistoReduction(reducer_arr);
220 } else if (mode == HistoReductionMode::TeamBased) {
221 msg << "Using team-based + atomic reduction." << endl;
222 executeInitLocalHistoReductionTeamFor();
223 } else if (mode == HistoReductionMode::ParallelReduce) {
224 auto to_reduce = createReductionObject<size_type, bin_index_type>(binCount);
225 std::visit([&](auto& reducer_arr) {
226 msg << "Starting parallel_reduce, array size = " << sizeof(reducer_arr.the_array) / sizeof(reducer_arr.the_array[0]) << endl;
227 executeInitLocalHistoReduction(reducer_arr);
230 msg << "No valid execution method defined to initialize local histogram for energy binning." << endl;
231 ippl::Comm->abort(); // Exit, since histogram cannot be built (should not happen...)!
233 IpplTimings::stopTimer(bExecuteHistoReductionT);
235 msg << "Reducer ran without error." << endl;
237 localBinHisto_m.sync(); // since all reductions happen on device --> sync changes if DualView mode is used
241 template <typename BunchType, typename BinningSelector>
242 void AdaptBins<BunchType, BinningSelector>::initGlobalHistogram() {
243 Inform msg("AdaptBins");
245 // Number of local bins == number of global bins!
246 // Note that this step only happens after the uniform histogram is computed.
247 bin_index_type numBins = getCurrentBinCount();
249 // Create a view to hold the global histogram on all ranks
250 globalBinHisto_m = h_histo_type_g("globalBinHisto_m", numBins, xMax_m - xMin_m,
251 binningAlpha_m, binningBeta_m, desiredWidth_m);
253 // Get host ("mirror" <-- depends on mode of BinHisto class) view of histograms --> reduce on host!
254 hview_type localBinHistoHost = localBinHisto_m.template getHostView<hview_type>(localBinHisto_m.getHistogram());
255 hview_type_g globalBinHistoHost = globalBinHisto_m.template getHostView<hview_type_g>(globalBinHisto_m.getHistogram());
258 // Note: The allreduce also works when the .data() returns a CUDA space pointer.
259 // However, for some reason, copying manually to host and then allreduce-ing is faster.
260 IpplTimings::startTimer(bAllReduceGlobalHistoT);
261 ippl::Comm->allreduce(
262 localBinHistoHost.data(),
263 globalBinHistoHost.data(),
265 std::plus<size_type>()
267 IpplTimings::stopTimer(bAllReduceGlobalHistoT);
269 // The global histogram is currently on host --> sync to device if used as DualView.
270 // Note that the global histogram should be used in a host-only mode, so these calls
271 // don't have an effect!
272 globalBinHisto_m.modify_host();
273 globalBinHisto_m.sync();
275 msg << "Global histogram created." << endl;
279 template <typename BunchType, typename BinningSelector>
280 void AdaptBins<BunchType, BinningSelector>::genAdaptiveHistogram() {
281 var_selector_m.updateDataArr(bunch_m); // Probably not necessary, since it is called before updating particles; but just in case!
282 bin_view_type binIndex = getBinView();
283 hindex_transform_type adaptLookup = globalBinHisto_m.mergeBins();
285 // This step is currently unnecessary, since mergeBins works on host only. However, just in case, we can keep this
286 // if should become available on device as well (there is barely a performance hit, since the lookup array is usually
287 // only 128 elements large).
288 dindex_transform_type adaptLookupDevice = dindex_transform_type("adaptLookupDevice", currentBins_m);
289 Kokkos::deep_copy(adaptLookupDevice, adaptLookup);
291 setCurrentBinCount(globalBinHisto_m.getCurrentBinCount());
293 // Map old indices to the new histogram ("Rebin")
294 Kokkos::parallel_for("RebinParticles", bunch_m->getLocalNum(), KOKKOS_LAMBDA(const size_type& i) {
295 bin_index_type oldBin = binIndex(i);
296 binIndex(i) = adaptLookupDevice(oldBin);
299 // Update local histogram with new indices
300 instantiateHistogram(true);
301 initLocalHisto(); // Runs reducer on new bin indices (also does the sync).
303 localBinHisto_m.initPostSum(); // Only init postsum, since the widths are not constant anymore.
304 localBinHisto_m.copyBinWidths(globalBinHisto_m); // Manually copy non-constant widths from global histogram.
307 template <typename BunchType, typename BinningSelector>
308 template <typename T, unsigned Dim>
309 VField_t<T, Dim>& AdaptBins<BunchType, BinningSelector>::LTrans(VField_t<T, Dim>& field, const bin_index_type& currentBin) {
310 Inform m("AdaptBins");
312 position_view_type P = bunch_m->P.getView();
313 hash_type indices = sortedIndexArr_m;
315 // Calculate gamma factor for field back transformation
316 // Note that P is saved normalized in OPAL, so technically p/mc
317 Vector<T, Dim> gamma_bin2(0.0);
318 Kokkos::parallel_reduce("CalculateGammaFactor", getBinIterationPolicy(currentBin),
319 KOKKOS_LAMBDA(const size_type& i, Vector<double, 3>& v) {
320 Vector<double, 3> v_comp = P(indices(i));
322 }, Kokkos::Sum<Vector<T, Dim>>(gamma_bin2));
323 bin_index_type npart_bin = getNPartInBin(currentBin);
326 * TODO Note: when the load balancer is not called often enough, then the adaptive bin
327 * can lead to a phenomenon where the number of particles in a bin is zero, which leads to
328 * a division by zero.
329 * So: either check if the number is 0 or make sure ranks always have enough particles!
331 gamma_bin2 = (npart_bin == 0) ? Vector<double, 3>(0.0) : gamma_bin2/npart_bin; // Now we have <P> for this bin
332 gamma_bin2 = -sqrt(1.0 + gamma_bin2*gamma_bin2); // in these units: gamma=sqrt(1 + <P>^2), assuming <P^2>~0 (since bunch per bin should be "considered constant") // -1.0 / sqrt(1.0 - gamma_bin2 / c2); // negative sign, since we want the inverse transformation
334 m << "Gamma(binIndex = " << currentBin << ") = -" << gamma_bin2 << endl;
336 // Next apply the transformation --> do it manually, since fc->E*gamma does not exist in IPPL...
337 ippl::parallel_for("TransformFieldWithVelocity", field.getFieldRangePolicy(),
338 KOKKOS_LAMBDA(const ippl::RangePolicy<Dim>::index_array_type& idx) {
339 apply(field, idx) *= gamma_bin2;
346 template <typename BunchType, typename BinningSelector>
347 void AdaptBins<BunchType, BinningSelector>::sortContainerByBin() {
349 * Assume, this function is called after the prefix sum is initialized.
350 * Then the particles need to be changed (sorted) in the right order and finally
351 * the range_policy can simply be retrieved from the prefix sum for the scatter().
352 * This is handled by the BinHisto class.
355 bin_view_type bins = getBinView();
356 size_type localNumParticles = bunch_m->getLocalNum();
357 size_type numBins = getCurrentBinCount();
358 dview_type bin_counts = localBinHisto_m.template getDeviceView<dview_type>(localBinHisto_m.getHistogram());
360 // Get post sum (already calculated with histogram and saved inside local_bin_histo_post_sum_m), use copy to not modify the original
361 Kokkos::View<size_type*> bin_offsets("bin_offsets", numBins + 1);
362 Kokkos::deep_copy(bin_offsets, localBinHisto_m.template getDeviceView<dview_type>(localBinHisto_m.getPostSum()));
364 // Initialize index array, use local shallow copy of the view
365 sortedIndexArr_m = hash_type("indices", localNumParticles);
366 hash_type indices = sortedIndexArr_m;
368 // Builds sorted index array using bin sort
369 IpplTimings::startTimer(bSortContainerByBinT);
370 Kokkos::parallel_for("SortIndices", localNumParticles, KOKKOS_LAMBDA(const size_type& i) {
371 size_type target_bin = bins(i);
372 size_type target_pos = Kokkos::atomic_fetch_add(&bin_offsets(target_bin), 1);
374 // Place the current particle directly into its target position
375 indices(target_pos) = i;
377 IpplTimings::stopTimer(bSortContainerByBinT);
379 // Test is sorting was successful (only in debug mode)
381 IpplTimings::startTimer(bVerifySortingT);
382 if (localNumParticles > 1 && !viewIsSorted<bin_index_type>(getBinView(), indices, localNumParticles)) {
383 Inform msg("AdaptBins");
384 msg << "Sorting failed." << endl;
387 IpplTimings::stopTimer(bVerifySortingT);
393#endif // ADAPT_BINS_HPP