Parallel memcpy in cpp
I am trying to copy a matrix in parallel. Below is the code I am working with. Currently it works as expected with char, but it segfaults when I use short. I assume the bug is that the copy writes outside the memory owned by the vector, but I have not been able to confirm that while debugging.
CMakeLists.txt
cmake_minimum_required(VERSION 3.0)
project(memcpy CXX)
find_package (Threads)
add_executable(memcpy main.cpp)
set_property(TARGET memcpy PROPERTY CXX_STANDARD 17)
target_link_libraries (memcpy ${CMAKE_THREAD_LIBS_INIT})
main.cpp
#include <cassert>
#include <condition_variable>
#include <cstring>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

class Barrier {
public:
    explicit Barrier(std::size_t const count) : m_threshold(count), m_remaining(count), m_generation(0) {}

    void wait() {
        auto local = std::unique_lock<std::mutex>{m_mutex};
        auto current_generation = m_generation;
        m_remaining--;
        if (!m_remaining) {
            m_generation++;
            m_remaining = m_threshold;
            m_condition.notify_all();
        } else {
            m_condition.wait(local, [this, current_generation] { return current_generation != m_generation; });
        }
    }

private:
    std::mutex m_mutex;
    std::condition_variable m_condition;
    std::size_t m_threshold;
    std::size_t m_remaining;
    std::size_t m_generation;
};

template <typename T>
class Matrix {
    using reference = typename std::vector<T>::reference;
    using const_reference = typename std::vector<T>::const_reference;

public:
    Matrix(std::size_t rows, std::size_t cols) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows) {}
    Matrix(std::size_t rows, std::size_t cols, T const& default_val) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows, default_val) {}

    constexpr std::size_t get_columns() const { return m_cols; }
    constexpr std::size_t get_rows() const { return m_rows; }
    constexpr std::size_t get_element_count() const {
        assert(m_cols * m_rows == m_data.size());
        return m_cols * m_rows;
    }

    T* data() { return m_data.data(); }
    T const* data() const { return m_data.data(); }

    reference operator()(std::size_t const column_x, std::size_t const row_y) {
        assert(0 <= column_x);
        assert(column_x < get_columns());
        assert(0 <= row_y);
        assert(row_y < get_rows());
        return m_data[row_y * m_cols + column_x];
    }

    const_reference operator()(std::size_t const column_x, std::size_t const row_y) const {
        assert(0 <= column_x);
        assert(column_x < get_columns());
        assert(0 <= row_y);
        assert(row_y < get_rows());
        return m_data[row_y * m_cols + column_x];
    }

private:
    std::size_t const m_rows;
    std::size_t const m_cols;
    std::vector<T> m_data;
};
// static_assert(false, "FIX ME");  // reminder: pick exactly one element type T below
using T = char;
// using T = short;
// using T = int;
// using T = double;
void run(std::size_t const my_rank, std::size_t const num_threads, Barrier& barrier, Matrix<T> const& from_data, Matrix<T>& to_data) {
    auto n = from_data.get_element_count();
    std::string str;
    if (my_rank == 0) {
        std::cerr << "bytes to copy: " << (n * sizeof(T)) << '\n';
    }

    // initialization
    std::size_t segment_size = n / num_threads;
    std::size_t start = (my_rank * segment_size) * sizeof(T);
    std::size_t end = ((my_rank + 1) * segment_size) * sizeof(T);
    std::size_t distance = end - start;

    str += " my_rank: " + std::to_string(my_rank);
    str += " segment_size: " + std::to_string(segment_size);
    str += " start: " + std::to_string(start);
    str += " end: " + std::to_string(end);
    str += " distance: " + std::to_string(distance);
    str += " rank: " + std::to_string(my_rank);
    str += " start: " + std::to_string(start);
    str += " end: " + std::to_string(end);
    str += " distance: " + std::to_string(distance);
    str += " e: " + std::to_string(start + distance);
    str += "\n";
    std::cerr << str;

    barrier.wait();
    std::memcpy(to_data.data() + start, from_data.data() + start, distance);
    barrier.wait();

    if (my_rank == 0)
        for (auto y = 0; y < from_data.get_rows(); y++) {
            for (auto x = 0; x < from_data.get_columns(); x++) {
                if (to_data(x, y) != from_data(x, y)) {
                    std::cerr << "x: " << x << '\t' << "y: " << y << "\t\t";
                    std::cerr << "to: " << to_data(x, y) << '\t' << "from: " << from_data(x, y) << '\n';
                }
            }
        }
    barrier.wait();
}
int main() {
    auto const num_threads = 1;
    // auto const num_threads = 4;
    // auto const width = 64;
    // auto const height = 64;
    auto const width = 97;
    auto const height = 101;

    auto from_data = Matrix<T>(width, height, 70);
    auto to_data = Matrix<T>(width, height, 84);

    std::vector<std::thread> threads;
    auto barrier = Barrier{num_threads};
    for (auto i = 0; i < num_threads; i++) {
        threads.emplace_back(run, i, num_threads, std::ref(barrier), std::ref(from_data), std::ref(to_data));
    }
    for (auto& thread : threads) {
        thread.join();
    }
}
c++ multithreading pointers c++14 memcpy
I doubt you will get a significant performance improvement from threads if your goal is just to duplicate data in memory... maybe if you use a number of threads equal to the number of memory channels, but even so, CPU clocks are already about double the memory's, which makes it four times faster considering you have to read and then write back. The memory bus is the bottleneck here, not the CPU. – Havenard, Nov 15 '18 at 4:14

auto from_data = Matrix<std::string>(width, height, 70); -- Your code is instantly broken. If you had considered things like this, you would never have used memcpy. Never use std::memcpy if there is a chance that the thing you're copying could be non-POD. Compilers these days are smart enough to choose what type of copy to use when you use std::copy instead (either memcpy, a loop, etc.). – PaulMcKenzie, Nov 15 '18 at 4:32

@Havenard The threads already exist at this point in the code and would have nothing to do but wait around for the memcpy to finish. (It's also for a university assignment.) – Brandon, Nov 15 '18 at 4:33
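To make PaulMcKenzie's point concrete, here is a minimal standalone sketch (not taken from the question; the file name and values are made up) showing that std::copy stays correct when the element type is not trivially copyable, whereas a byte-wise std::memcpy over the same objects would be undefined behavior:

// non_pod_copy.cpp -- hypothetical example, not part of the question's code
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> from(8, "hello");  // std::string is not trivially copyable
    std::vector<std::string> to(8);

    // std::copy performs element-wise copy assignment, so it is valid for any
    // copyable T; for trivially copyable T compilers lower it to memmove anyway.
    // A std::memcpy over these std::string objects would be undefined behavior.
    std::copy(from.begin(), from.end(), to.begin());

    assert(to == from);
}

The same std::copy call works unchanged for char, short, int, or double, which is why the answer below recommends it.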
1 Answer
std::memcpy(to_data.data() + start, from_data.data() + start, distance)

std::vector<T>::data() returns a T*, so if you add an integral value foo to it, you effectively add foo * sizeof(T) bytes ... but you already multiplied by sizeof(T) earlier when calculating start and end. Also, std::memcpy() won't work for Ts that are not PODs.

Better use std::copy().
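A minimal sketch of the fix along the answer's advice, assuming the rest of the question's code stays as posted: keep start and end as element indices (pointer arithmetic on a T* already scales by sizeof(T)) and let std::copy do the copying. copy_segment is a hypothetical helper name, not something from the original code:

// Requires #include <algorithm>; Barrier, Matrix<T> and T are the question's types.
void copy_segment(std::size_t const my_rank, std::size_t const num_threads,
                  Barrier& barrier, Matrix<T> const& from_data, Matrix<T>& to_data) {
    auto const n = from_data.get_element_count();
    // Work in elements, not bytes: data() + start already advances by start * sizeof(T) bytes.
    std::size_t const segment_size = n / num_threads;
    std::size_t const start = my_rank * segment_size;
    // Let the last thread take the remainder when n is not divisible by num_threads.
    std::size_t const end = (my_rank + 1 == num_threads) ? n : (my_rank + 1) * segment_size;

    barrier.wait();
    // std::copy is valid for any copyable T and is typically lowered to memmove
    // for trivially copyable types, so nothing is lost compared with memcpy.
    std::copy(from_data.data() + start, from_data.data() + end, to_data.data() + start);
    barrier.wait();
}

With T = short, the original to_data.data() + start advances start elements rather than start bytes, i.e. twice as far as intended, so every thread except rank 0 reads and writes past the end of the vectors, which matches the reported segfault.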
Also, std::copy is written to be smart enough to fall back to std::memcpy or equivalent if the type is detected as trivially copyable, so you're not losing anything by using std::copy. – PaulMcKenzie, Nov 15 '18 at 4:40