/* sm4_cl.c */
  1. #include <stdio.h>
  2. #include <string.h>
  3. #include <stdlib.h>
  4. #include <stdint.h>
  5. #include <gmssl/sm4.h>
  6. #define MACOS
  7. #ifdef MACOS
  8. #include <OpenCL/OpenCL.h>
  9. #else
  10. #include <CL/cl.h>
  11. #endif
  12. static char *clErrorString(cl_uint err)
  13. {
  14. switch (err) {
  15. case CL_SUCCESS: return "CL_SUCCESS!";
  16. case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND";
  17. case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE";
  18. case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE";
  19. case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
  20. case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES";
  21. case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY";
  22. case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE";
  23. case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP";
  24. case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH";
  25. case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
  26. case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE";
  27. case CL_MAP_FAILURE: return "CL_MAP_FAILURE";
  28. case CL_INVALID_VALUE: return "CL_INVALID_VALUE";
  29. case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE";
  30. case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM";
  31. case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE";
  32. case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT";
  33. case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES";
  34. case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE";
  35. case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR";
  36. case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT";
  37. case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
  38. case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE";
  39. case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER";
  40. case CL_INVALID_BINARY: return "CL_INVALID_BINARY";
  41. case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS";
  42. case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM";
  43. case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE";
  44. case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME";
  45. case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION";
  46. case CL_INVALID_KERNEL: return "CL_INVALID_KERNEL";
  47. case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX";
  48. case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE";
  49. case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE";
  50. case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS";
  51. case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION";
  52. case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE";
  53. case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE";
  54. case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET";
  55. case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST";
  56. case CL_INVALID_EVENT: return "CL_INVALID_EVENT";
  57. case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION";
  58. case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT";
  59. case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE";
  60. case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL";
  61. }
  62. return NULL;
  63. }
  64. static const char *sm4_cl_src;
  65. typedef struct {
  66. uint32_t rk[32];
  67. cl_context context;
  68. cl_command_queue queue;
  69. cl_program program;
  70. cl_kernel kernel;
  71. cl_mem mem_rk;
  72. cl_mem mem_io;
  73. size_t workgroup_size;
  74. } SM4_CL_CTX;
  75. #define cl_error_print(e) \
  76. do { fprintf(stderr, "%s: %d: %s()\n",__FILE__,__LINE__,clErrorString(e)); } while (0)
  77. void sm4_cl_cleanup(SM4_CL_CTX *ctx)
  78. {
  79. clReleaseContext(ctx->context);
  80. clReleaseCommandQueue(ctx->queue);
  81. clReleaseProgram(ctx->program);
  82. clReleaseKernel(ctx->kernel);
  83. }
  84. static int sm4_cl_set_key(SM4_CL_CTX *ctx, const uint8_t key[16], int enc)
  85. {
  86. cl_platform_id platform;
  87. cl_device_id device;
  88. cl_uint device_cnt;
  89. cl_int err;
  90. char sval[256];
  91. size_t slen;
  92. cl_command_queue_properties queue_prop = 0;
  93. const char *build_opts = NULL;
  94. memset(ctx, 0, sizeof(*ctx));
  95. if ((err = clGetPlatformIDs(1, &platform, NULL)) != CL_SUCCESS) {
  96. cl_error_print(err);
  97. return -1;
  98. }
  99. if ((err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &device_cnt)) != CL_SUCCESS) {
  100. cl_error_print(err);
  101. return -1;
  102. }
  103. if (!(ctx->context = clCreateContext(NULL, 1, &device, NULL, NULL, &err))) {
  104. cl_error_print(err);
  105. return -1;
  106. }
  107. if (!(ctx->queue = clCreateCommandQueue(ctx->context, device, queue_prop, &err))) {
  108. cl_error_print(err);
  109. goto end;
  110. }
  111. if (!(ctx->program = clCreateProgramWithSource(ctx->context, 1, (const char **)&sm4_cl_src, NULL, &err))) {
  112. cl_error_print(err);
  113. goto end;
  114. }
  115. if ((err = clBuildProgram(ctx->program, 1, &device, build_opts, NULL, NULL)) != CL_SUCCESS) {
  116. char *log = NULL;
  117. size_t loglen;
  118. cl_error_print(err);
  119. if ((err = clGetProgramBuildInfo(ctx->program, device, CL_PROGRAM_BUILD_LOG, sizeof(log), NULL, &loglen)) != CL_SUCCESS) {
  120. cl_error_print(err);
  121. goto end;
  122. }
  123. if (!(log = (char *)malloc(loglen))) {
  124. goto end;
  125. }
  126. if ((err = clGetProgramBuildInfo(ctx->program, device, CL_PROGRAM_BUILD_LOG, sizeof(log), NULL, &loglen)) != CL_SUCCESS) {
  127. cl_error_print(err);
  128. free(log);
  129. goto end;
  130. }
  131. fprintf(stderr, "%s %d: %s\n", __FILE__, __LINE__, log);
  132. free(log);
  133. goto end;
  134. }
  135. if (!(ctx->kernel = clCreateKernel(ctx->program, "sm4_encrypt", &err))) {
  136. cl_error_print(err);
  137. goto end;
  138. }
  139. if ((err = clGetKernelWorkGroupInfo(ctx->kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
  140. sizeof(ctx->workgroup_size), &ctx->workgroup_size, NULL)) != CL_SUCCESS) {
  141. cl_error_print(err);
  142. goto end;
  143. }
  144. if (enc) {
  145. sm4_set_encrypt_key((SM4_KEY *)ctx->rk, key);
  146. } else {
  147. sm4_set_decrypt_key((SM4_KEY *)ctx->rk, key);
  148. }
  149. if (!(ctx->mem_rk = clCreateBuffer(ctx->context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, sizeof(SM4_KEY), ctx->rk, &err))) {
  150. cl_error_print(err);
  151. goto end;
  152. }
  153. if ((err = clSetKernelArg(ctx->kernel, 0, sizeof(cl_mem), &ctx->mem_rk)) != CL_SUCCESS) {
  154. cl_error_print(err);
  155. goto end;
  156. }
  157. return 1;
  158. end:
  159. return -1;
  160. }
  161. int sm4_cl_set_encrypt_key(SM4_CL_CTX *ctx, const uint8_t key[16])
  162. {
  163. return sm4_cl_set_key(ctx, key, 1);
  164. }
  165. int sm4_cl_set_decrypt_key(SM4_CL_CTX *ctx, const uint8_t key[16])
  166. {
  167. return sm4_cl_set_key(ctx, key, 0);
  168. }
  169. int sm4_cl_encrypt(SM4_CL_CTX *ctx, const uint8_t *in, size_t nblocks, uint8_t *out)
  170. {
  171. int ret = -1;
  172. cl_mem mem;
  173. cl_int err;
  174. size_t len = 16 * nblocks;
  175. cl_uint dim = 1;
  176. void *p;
  177. if (out != in)
  178. memcpy(out, in, len);
  179. if (!(mem = clCreateBuffer(ctx->context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, len, out, &err))) {
  180. cl_error_print(err);
  181. return -1;
  182. }
  183. if ((err = clSetKernelArg(ctx->kernel, 1, sizeof(cl_mem), &mem)) != CL_SUCCESS) {
  184. cl_error_print(err);
  185. goto end;
  186. }
  187. if ((err = clEnqueueNDRangeKernel(ctx->queue, ctx->kernel, dim, NULL, &nblocks, &ctx->workgroup_size, 0, NULL, NULL)) != CL_SUCCESS) {
  188. cl_error_print(err);
  189. goto end;
  190. }
  191. if (!(p = clEnqueueMapBuffer(ctx->queue, mem, CL_TRUE, 0, 0, len, 0, NULL, NULL, &err))) {
  192. cl_error_print(err);
  193. goto end;
  194. }
  195. if (p != out) {
  196. fprintf(stderr, "%s %d: shit\n", __FILE__, __LINE__);
  197. goto end;
  198. }
  199. ret = 1;
  200. end:
  201. clReleaseMemObject(mem);
  202. return ret;
  203. }
  204. int test_sm4_cl_encrypt(void)
  205. {
  206. const uint8_t key[16] = {
  207. 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
  208. 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
  209. };
  210. const uint8_t plaintext[16] = {
  211. 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
  212. 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
  213. };
  214. const uint8_t ciphertext[16] = {
  215. 0x68, 0x1e, 0xdf, 0x34, 0xd2, 0x06, 0x96, 0x5e,
  216. 0x86, 0xb3, 0xe9, 0x4f, 0x53, 0x6e, 0x42, 0x46,
  217. };
  218. int ret = -1;
  219. SM4_CL_CTX ctx;
  220. size_t nblocks = 1024;
  221. uint8_t *buf = NULL;
  222. size_t i;
  223. if (!(buf = (uint8_t *)malloc(16 * nblocks))) {
  224. error_print();
  225. return -1;
  226. }
  227. for (i = 0; i < nblocks; i++) {
  228. memcpy(buf + 16 * i, plaintext, 16);
  229. }
  230. if (sm4_cl_set_encrypt_key(&ctx, key) != 1) {
  231. error_print();
  232. goto end;
  233. }
  234. if (sm4_cl_encrypt(&ctx, buf, nblocks, buf) != 1) {
  235. error_print();
  236. goto end;
  237. }
  238. for (i = 0; i < nblocks; i++) {
  239. if (memcmp(buf + 16 * i, ciphertext, 16) != 0) {
  240. error_print();
  241. goto end;
  242. }
  243. }
  244. ret = 1;
  245. end:
  246. if (buf) free(buf);
  247. sm4_cl_cleanup(&ctx);
  248. return ret;
  249. }
  250. #define KERNEL(...) #__VA_ARGS__
  251. const char *sm4_cl_src = KERNEL(
  252. __constant unsigned char SBOX[256] = {
  253. 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
  254. 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
  255. 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a, 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
  256. 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
  257. 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
  258. 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
  259. 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
  260. 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
  261. 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5, 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
  262. 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
  263. 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
  264. 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
  265. 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
  266. 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
  267. 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e, 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
  268. 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48,
  269. };
  270. __kernel void sm4_encrypt(__global const unsigned int *rkey, __global unsigned char *data)
  271. {
  272. __local unsigned char S[256];
  273. __local unsigned int rk[32];
  274. unsigned int x0, x1, x2, x3, x4, i, t;
  275. uint global_id = get_global_id(0);
  276. __global unsigned char *p = data + 16 * global_id;
  277. __global unsigned int *in = (__global unsigned int *)p;
  278. __global unsigned int *out = (__global unsigned int *)p;
  279. if (get_local_id(0) == 0) {
  280. for (i = 0; i < 256; i++) {
  281. S[i] = SBOX[i];
  282. }
  283. for (i = 0; i < 32; i++) {
  284. rk[i] = rkey[i];
  285. }
  286. }
  287. x0 = (in[0] >> 24) | ((in[0] >> 8) & 0xff00) | ((in[0] << 8) & 0xff0000) | (in[0] << 24);
  288. x1 = (in[1] >> 24) | ((in[1] >> 8) & 0xff00) | ((in[1] << 8) & 0xff0000) | (in[1] << 24);
  289. x2 = (in[2] >> 24) | ((in[2] >> 8) & 0xff00) | ((in[2] << 8) & 0xff0000) | (in[2] << 24);
  290. x3 = (in[3] >> 24) | ((in[3] >> 8) & 0xff00) | ((in[3] << 8) & 0xff0000) | (in[3] << 24);
  291. for (i = 0; i < 31; i++) {
  292. x4 = x1 ^ x2 ^ x3 ^ rk[i];
  293. x4 = (S[x4 >> 24] << 24) ^ (S[(x4 >> 16) & 0xff] << 16) ^ (S[(x4 >> 8) & 0xff] << 8) ^ S[x4 & 0xff];
  294. x4 = x0 ^ (x4 ^
  295. ((x4 << 2) | (x4 >> (32 - 2))) ^
  296. ((x4 << 10) | (x4 >> (32 - 10))) ^
  297. ((x4 << 18) | (x4 >> (32 - 18))) ^
  298. ((x4 << 24) | (x4 >> (32 - 24))));
  299. t = x0;
  300. x0 = x1;
  301. x1 = x2;
  302. x2 = x3;
  303. x3 = x4;
  304. x4 = t;
  305. }
  306. x4 = x1 ^ x2 ^ x3 ^ rk[i];
  307. x4 = (S[x4 >> 24] << 24) ^ (S[(x4 >> 16) & 0xff] << 16) ^ (S[(x4 >> 8) & 0xff] << 8) ^ S[x4 & 0xff];
  308. x4 = x0 ^ (x4 ^
  309. ((x4 << 2) | (x4 >> (32 - 2))) ^
  310. ((x4 << 10) | (x4 >> (32 - 10))) ^
  311. ((x4 << 18) | (x4 >> (32 - 18))) ^
  312. ((x4 << 24) | (x4 >> (32 - 24))));
  313. out[0] = (x4 >> 24) | ((x4 >> 8) & 0xff00) | ((x4 << 8) & 0xff0000) | (x4 << 24);
  314. out[1] = (x3 >> 24) | ((x3 >> 8) & 0xff00) | ((x3 << 8) & 0xff0000) | (x3 << 24);
  315. out[2] = (x2 >> 24) | ((x2 >> 8) & 0xff00) | ((x2 << 8) & 0xff0000) | (x2 << 24);
  316. out[3] = (x1 >> 24) | ((x1 >> 8) & 0xff00) | ((x1 << 8) & 0xff0000) | (x1 << 24);
  317. }
  318. );