diff options
Diffstat (limited to 'Godeps/_workspace/src/github.com/ethereum/ethash/libethash-cuda/cuPrintf.cu')
-rw-r--r-- | Godeps/_workspace/src/github.com/ethereum/ethash/libethash-cuda/cuPrintf.cu | 879 |
1 files changed, 879 insertions, 0 deletions
diff --git a/Godeps/_workspace/src/github.com/ethereum/ethash/libethash-cuda/cuPrintf.cu b/Godeps/_workspace/src/github.com/ethereum/ethash/libethash-cuda/cuPrintf.cu new file mode 100644 index 000000000..f06653f2d --- /dev/null +++ b/Godeps/_workspace/src/github.com/ethereum/ethash/libethash-cuda/cuPrintf.cu @@ -0,0 +1,879 @@ +/* + Copyright 2009 NVIDIA Corporation. All rights reserved. + + NOTICE TO LICENSEE: + + This source code and/or documentation ("Licensed Deliverables") are subject + to NVIDIA intellectual property rights under U.S. and international Copyright + laws. + + These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL + to NVIDIA and is being provided under the terms and conditions of a form of + NVIDIA software license agreement by and between NVIDIA and Licensee ("License + Agreement") or electronically accepted by Licensee. Notwithstanding any terms + or conditions to the contrary in the License Agreement, reproduction or + disclosure of the Licensed Deliverables to any third party without the express + written consent of NVIDIA is prohibited. + + NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, + NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE LICENSED + DELIVERABLES FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED + WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE + LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. NOTWITHSTANDING ANY + TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, IN NO EVENT SHALL + NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, + OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THESE LICENSED DELIVERABLES. + + U.S. Government End Users. These Licensed Deliverables are a "commercial item" + as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + "commercial computer software" and "commercial computer software documentation" + as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the + U.S. Government only as a commercial end item. Consistent with 48 C.F.R.12.212 + and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all U.S. Government + End Users acquire the Licensed Deliverables with only those rights set forth + herein. + + Any use of the Licensed Deliverables in individual and commercial software must + include, in the user documentation and internal comments to the code, the above + Disclaimer and U.S. Government End Users Notice. + */ + +/* + * cuPrintf.cu + * + * This is a printf command callable from within a kernel. It is set + * up so that output is sent to a memory buffer, which is emptied from + * the host side - but only after a cudaThreadSynchronize() on the host. + * + * Currently, there is a limitation of around 200 characters of output + * and no more than 10 arguments to a single cuPrintf() call. Issue + * multiple calls if longer format strings are required. + * + * It requires minimal setup, and is *NOT* optimised for performance. + * For example, writes are not coalesced - this is because there is an + * assumption that people will not want to printf from every single one + * of thousands of threads, but only from individual threads at a time. + * + * Using this is simple - it requires one host-side call to initialise + * everything, and then kernels can call cuPrintf at will. Sample code + * is the easiest way to demonstrate: + * + #include "cuPrintf.cu" + + __global__ void testKernel(int val) + { + cuPrintf("Value is: %d\n", val); + } + + int main() + { + cudaPrintfInit(); + testKernel<<< 2, 3 >>>(10); + cudaPrintfDisplay(stdout, true); + cudaPrintfEnd(); + return 0; + } + * + * See the header file, "cuPrintf.cuh" for more info, especially + * arguments to cudaPrintfInit() and cudaPrintfDisplay(); + */ + +#ifndef CUPRINTF_CU +#define CUPRINTF_CU + +#include "cuPrintf.cuh" +#if __CUDA_ARCH__ > 100 // Atomics only used with > sm_10 architecture +#include <sm_11_atomic_functions.h> +#endif + +// This is the smallest amount of memory, per-thread, which is allowed. +// It is also the largest amount of space a single printf() can take up +const static int CUPRINTF_MAX_LEN = 256; + +// This structure is used internally to track block/thread output restrictions. +typedef struct __align__(8) { + int threadid; // CUPRINTF_UNRESTRICTED for unrestricted + int blockid; // CUPRINTF_UNRESTRICTED for unrestricted +} cuPrintfRestriction; + +// The main storage is in a global print buffer, which has a known +// start/end/length. These are atomically updated so it works as a +// circular buffer. +// Since the only control primitive that can be used is atomicAdd(), +// we cannot wrap the pointer as such. The actual address must be +// calculated from printfBufferPtr by mod-ing with printfBufferLength. +// For sm_10 architecture, we must subdivide the buffer per-thread +// since we do not even have an atomic primitive. +__constant__ static char *globalPrintfBuffer = NULL; // Start of circular buffer (set up by host) +__constant__ static int printfBufferLength = 0; // Size of circular buffer (set up by host) +__device__ static cuPrintfRestriction restrictRules; // Output restrictions +__device__ volatile static char *printfBufferPtr = NULL; // Current atomically-incremented non-wrapped offset + +// This is the header preceeding all printf entries. +// NOTE: It *must* be size-aligned to the maximum entity size (size_t) +typedef struct __align__(8) { + unsigned short magic; // Magic number says we're valid + unsigned short fmtoffset; // Offset of fmt string into buffer + unsigned short blockid; // Block ID of author + unsigned short threadid; // Thread ID of author +} cuPrintfHeader; + +// Special header for sm_10 architecture +#define CUPRINTF_SM10_MAGIC 0xC810 // Not a valid ascii character +typedef struct __align__(16) { + unsigned short magic; // sm_10 specific magic number + unsigned short unused; + unsigned int thread_index; // thread ID for this buffer + unsigned int thread_buf_len; // per-thread buffer length + unsigned int offset; // most recent printf's offset +} cuPrintfHeaderSM10; + + +// Because we can't write an element which is not aligned to its bit-size, +// we have to align all sizes and variables on maximum-size boundaries. +// That means sizeof(double) in this case, but we'll use (long long) for +// better arch<1.3 support +#define CUPRINTF_ALIGN_SIZE sizeof(long long) + +// All our headers are prefixed with a magic number so we know they're ready +#define CUPRINTF_SM11_MAGIC (unsigned short)0xC811 // Not a valid ascii character + + +// +// getNextPrintfBufPtr +// +// Grabs a block of space in the general circular buffer, using an +// atomic function to ensure that it's ours. We handle wrapping +// around the circular buffer and return a pointer to a place which +// can be written to. +// +// Important notes: +// 1. We always grab CUPRINTF_MAX_LEN bytes +// 2. Because of 1, we never worry about wrapping around the end +// 3. Because of 1, printfBufferLength *must* be a factor of CUPRINTF_MAX_LEN +// +// This returns a pointer to the place where we own. +// +__device__ static char *getNextPrintfBufPtr() +{ + // Initialisation check + if(!printfBufferPtr) + return NULL; + + // Thread/block restriction check + if((restrictRules.blockid != CUPRINTF_UNRESTRICTED) && (restrictRules.blockid != (blockIdx.x + gridDim.x*blockIdx.y))) + return NULL; + if((restrictRules.threadid != CUPRINTF_UNRESTRICTED) && (restrictRules.threadid != (threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z))) + return NULL; + + // Conditional section, dependent on architecture +#if __CUDA_ARCH__ == 100 + // For sm_10 architectures, we have no atomic add - this means we must split the + // entire available buffer into per-thread blocks. Inefficient, but what can you do. + int thread_count = (gridDim.x * gridDim.y) * (blockDim.x * blockDim.y * blockDim.z); + int thread_index = threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z + + (blockIdx.x + gridDim.x*blockIdx.y) * (blockDim.x * blockDim.y * blockDim.z); + + // Find our own block of data and go to it. Make sure the per-thread length + // is a precise multiple of CUPRINTF_MAX_LEN, otherwise we risk size and + // alignment issues! We must round down, of course. + unsigned int thread_buf_len = printfBufferLength / thread_count; + thread_buf_len &= ~(CUPRINTF_MAX_LEN-1); + + // We *must* have a thread buffer length able to fit at least two printfs (one header, one real) + if(thread_buf_len < (CUPRINTF_MAX_LEN * 2)) + return NULL; + + // Now address our section of the buffer. The first item is a header. + char *myPrintfBuffer = globalPrintfBuffer + (thread_buf_len * thread_index); + cuPrintfHeaderSM10 hdr = *(cuPrintfHeaderSM10 *)(void *)myPrintfBuffer; + if(hdr.magic != CUPRINTF_SM10_MAGIC) + { + // If our header is not set up, initialise it + hdr.magic = CUPRINTF_SM10_MAGIC; + hdr.thread_index = thread_index; + hdr.thread_buf_len = thread_buf_len; + hdr.offset = 0; // Note we start at 0! We pre-increment below. + *(cuPrintfHeaderSM10 *)(void *)myPrintfBuffer = hdr; // Write back the header + + // For initial setup purposes, we might need to init thread0's header too + // (so that cudaPrintfDisplay() below will work). This is only run once. + cuPrintfHeaderSM10 *tophdr = (cuPrintfHeaderSM10 *)(void *)globalPrintfBuffer; + tophdr->thread_buf_len = thread_buf_len; + } + + // Adjust the offset by the right amount, and wrap it if need be + unsigned int offset = hdr.offset + CUPRINTF_MAX_LEN; + if(offset >= hdr.thread_buf_len) + offset = CUPRINTF_MAX_LEN; + + // Write back the new offset for next time and return a pointer to it + ((cuPrintfHeaderSM10 *)(void *)myPrintfBuffer)->offset = offset; + return myPrintfBuffer + offset; +#else + // Much easier with an atomic operation! + size_t offset = atomicAdd((unsigned int *)&printfBufferPtr, CUPRINTF_MAX_LEN) - (size_t)globalPrintfBuffer; + offset %= printfBufferLength; + return globalPrintfBuffer + offset; +#endif +} + + +// +// writePrintfHeader +// +// Inserts the header for containing our UID, fmt position and +// block/thread number. We generate it dynamically to avoid +// issues arising from requiring pre-initialisation. +// +__device__ static void writePrintfHeader(char *ptr, char *fmtptr) +{ + if(ptr) + { + cuPrintfHeader header; + header.magic = CUPRINTF_SM11_MAGIC; + header.fmtoffset = (unsigned short)(fmtptr - ptr); + header.blockid = blockIdx.x + gridDim.x*blockIdx.y; + header.threadid = threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z; + *(cuPrintfHeader *)(void *)ptr = header; + } +} + + +// +// cuPrintfStrncpy +// +// This special strncpy outputs an aligned length value, followed by the +// string. It then zero-pads the rest of the string until a 64-aligned +// boundary. The length *includes* the padding. A pointer to the byte +// just after the \0 is returned. +// +// This function could overflow CUPRINTF_MAX_LEN characters in our buffer. +// To avoid it, we must count as we output and truncate where necessary. +// +__device__ static char *cuPrintfStrncpy(char *dest, const char *src, int n, char *end) +{ + // Initialisation and overflow check + if(!dest || !src || (dest >= end)) + return NULL; + + // Prepare to write the length specifier. We're guaranteed to have + // at least "CUPRINTF_ALIGN_SIZE" bytes left because we only write out in + // chunks that size, and CUPRINTF_MAX_LEN is aligned with CUPRINTF_ALIGN_SIZE. + int *lenptr = (int *)(void *)dest; + int len = 0; + dest += CUPRINTF_ALIGN_SIZE; + + // Now copy the string + while(n--) + { + if(dest >= end) // Overflow check + break; + + len++; + *dest++ = *src; + if(*src++ == '\0') + break; + } + + // Now write out the padding bytes, and we have our length. + while((dest < end) && (((long)dest & (CUPRINTF_ALIGN_SIZE-1)) != 0)) + { + len++; + *dest++ = 0; + } + *lenptr = len; + return (dest < end) ? dest : NULL; // Overflow means return NULL +} + + +// +// copyArg +// +// This copies a length specifier and then the argument out to the +// data buffer. Templates let the compiler figure all this out at +// compile-time, making life much simpler from the programming +// point of view. I'm assuimg all (const char *) is a string, and +// everything else is the variable it points at. I'd love to see +// a better way of doing it, but aside from parsing the format +// string I can't think of one. +// +// The length of the data type is inserted at the beginning (so that +// the display can distinguish between float and double), and the +// pointer to the end of the entry is returned. +// +__device__ static char *copyArg(char *ptr, const char *arg, char *end) +{ + // Initialisation check + if(!ptr || !arg) + return NULL; + + // strncpy does all our work. We just terminate. + if((ptr = cuPrintfStrncpy(ptr, arg, CUPRINTF_MAX_LEN, end)) != NULL) + *ptr = 0; + + return ptr; +} + +template <typename T> +__device__ static char *copyArg(char *ptr, T &arg, char *end) +{ + // Initisalisation and overflow check. Alignment rules mean that + // we're at least CUPRINTF_ALIGN_SIZE away from "end", so we only need + // to check that one offset. + if(!ptr || ((ptr+CUPRINTF_ALIGN_SIZE) >= end)) + return NULL; + + // Write the length and argument + *(int *)(void *)ptr = sizeof(arg); + ptr += CUPRINTF_ALIGN_SIZE; + *(T *)(void *)ptr = arg; + ptr += CUPRINTF_ALIGN_SIZE; + *ptr = 0; + + return ptr; +} + + +// +// cuPrintf +// +// Templated printf functions to handle multiple arguments. +// Note we return the total amount of data copied, not the number +// of characters output. But then again, who ever looks at the +// return from printf() anyway? +// +// The format is to grab a block of circular buffer space, the +// start of which will hold a header and a pointer to the format +// string. We then write in all the arguments, and finally the +// format string itself. This is to make it easy to prevent +// overflow of our buffer (we support up to 10 arguments, each of +// which can be 12 bytes in length - that means that only the +// format string (or a %s) can actually overflow; so the overflow +// check need only be in the strcpy function. +// +// The header is written at the very last because that's what +// makes it look like we're done. +// +// Errors, which are basically lack-of-initialisation, are ignored +// in the called functions because NULL pointers are passed around +// + +// All printf variants basically do the same thing, setting up the +// buffer, writing all arguments, then finalising the header. For +// clarity, we'll pack the code into some big macros. +#define CUPRINTF_PREAMBLE \ + char *start, *end, *bufptr, *fmtstart; \ + if((start = getNextPrintfBufPtr()) == NULL) return 0; \ + end = start + CUPRINTF_MAX_LEN; \ + bufptr = start + sizeof(cuPrintfHeader); + +// Posting an argument is easy +#define CUPRINTF_ARG(argname) \ + bufptr = copyArg(bufptr, argname, end); + +// After args are done, record start-of-fmt and write the fmt and header +#define CUPRINTF_POSTAMBLE \ + fmtstart = bufptr; \ + end = cuPrintfStrncpy(bufptr, fmt, CUPRINTF_MAX_LEN, end); \ + writePrintfHeader(start, end ? fmtstart : NULL); \ + return end ? (int)(end - start) : 0; + +__device__ int cuPrintf(const char *fmt) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_POSTAMBLE; +} +template <typename T1> __device__ int cuPrintf(const char *fmt, T1 arg1) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2, typename T3> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + CUPRINTF_ARG(arg3); + + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2, typename T3, typename T4> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + CUPRINTF_ARG(arg3); + CUPRINTF_ARG(arg4); + + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2, typename T3, typename T4, typename T5> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + CUPRINTF_ARG(arg3); + CUPRINTF_ARG(arg4); + CUPRINTF_ARG(arg5); + + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + CUPRINTF_ARG(arg3); + CUPRINTF_ARG(arg4); + CUPRINTF_ARG(arg5); + CUPRINTF_ARG(arg6); + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + CUPRINTF_ARG(arg3); + CUPRINTF_ARG(arg4); + CUPRINTF_ARG(arg5); + CUPRINTF_ARG(arg6); + CUPRINTF_ARG(arg7); + + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + CUPRINTF_ARG(arg3); + CUPRINTF_ARG(arg4); + CUPRINTF_ARG(arg5); + CUPRINTF_ARG(arg6); + CUPRINTF_ARG(arg7); + CUPRINTF_ARG(arg8); + + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + CUPRINTF_ARG(arg3); + CUPRINTF_ARG(arg4); + CUPRINTF_ARG(arg5); + CUPRINTF_ARG(arg6); + CUPRINTF_ARG(arg7); + CUPRINTF_ARG(arg8); + CUPRINTF_ARG(arg9); + + CUPRINTF_POSTAMBLE; +} +template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, typename T10> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9, T10 arg10) +{ + CUPRINTF_PREAMBLE; + + CUPRINTF_ARG(arg1); + CUPRINTF_ARG(arg2); + CUPRINTF_ARG(arg3); + CUPRINTF_ARG(arg4); + CUPRINTF_ARG(arg5); + CUPRINTF_ARG(arg6); + CUPRINTF_ARG(arg7); + CUPRINTF_ARG(arg8); + CUPRINTF_ARG(arg9); + CUPRINTF_ARG(arg10); + + CUPRINTF_POSTAMBLE; +} +#undef CUPRINTF_PREAMBLE +#undef CUPRINTF_ARG +#undef CUPRINTF_POSTAMBLE + + +// +// cuPrintfRestrict +// +// Called to restrict output to a given thread/block. +// We store the info in "restrictRules", which is set up at +// init time by the host. It's not the cleanest way to do this +// because it means restrictions will last between +// invocations, but given the output-pointer continuity, +// I feel this is reasonable. +// +__device__ void cuPrintfRestrict(int threadid, int blockid) +{ + int thread_count = blockDim.x * blockDim.y * blockDim.z; + if(((threadid < thread_count) && (threadid >= 0)) || (threadid == CUPRINTF_UNRESTRICTED)) + restrictRules.threadid = threadid; + + int block_count = gridDim.x * gridDim.y; + if(((blockid < block_count) && (blockid >= 0)) || (blockid == CUPRINTF_UNRESTRICTED)) + restrictRules.blockid = blockid; +} + + +/////////////////////////////////////////////////////////////////////////////// +// HOST SIDE + +#include <stdio.h> +static FILE *printf_fp; + +static char *printfbuf_start=NULL; +static char *printfbuf_device=NULL; +static int printfbuf_len=0; + + +// +// outputPrintfData +// +// Our own internal function, which takes a pointer to a data buffer +// and passes it through libc's printf for output. +// +// We receive the formate string and a pointer to where the data is +// held. We then run through and print it out. +// +// Returns 0 on failure, 1 on success +// +static int outputPrintfData(char *fmt, char *data) +{ + // Format string is prefixed by a length that we don't need + fmt += CUPRINTF_ALIGN_SIZE; + + // Now run through it, printing everything we can. We must + // run to every % character, extract only that, and use printf + // to format it. + char *p = strchr(fmt, '%'); + while(p != NULL) + { + // Print up to the % character + *p = '\0'; + fputs(fmt, printf_fp); + *p = '%'; // Put back the % + + // Now handle the format specifier + char *format = p++; // Points to the '%' + p += strcspn(p, "%cdiouxXeEfgGaAnps"); + if(*p == '\0') // If no format specifier, print the whole thing + { + fmt = format; + break; + } + + // Cut out the format bit and use printf to print it. It's prefixed + // by its length. + int arglen = *(int *)data; + if(arglen > CUPRINTF_MAX_LEN) + { + fputs("Corrupt printf buffer data - aborting\n", printf_fp); + return 0; + } + + data += CUPRINTF_ALIGN_SIZE; + + char specifier = *p++; + char c = *p; // Store for later + *p = '\0'; + switch(specifier) + { + // These all take integer arguments + case 'c': + case 'd': + case 'i': + case 'o': + case 'u': + case 'x': + case 'X': + case 'p': + fprintf(printf_fp, format, *((int *)data)); + break; + + // These all take double arguments + case 'e': + case 'E': + case 'f': + case 'g': + case 'G': + case 'a': + case 'A': + if(arglen == 4) // Float vs. Double thing + fprintf(printf_fp, format, *((float *)data)); + else + fprintf(printf_fp, format, *((double *)data)); + break; + + // Strings are handled in a special way + case 's': + fprintf(printf_fp, format, (char *)data); + break; + + // % is special + case '%': + fprintf(printf_fp, "%%"); + break; + + // Everything else is just printed out as-is + default: + fprintf(printf_fp, format); + break; + } + data += CUPRINTF_ALIGN_SIZE; // Move on to next argument + *p = c; // Restore what we removed + fmt = p; // Adjust fmt string to be past the specifier + p = strchr(fmt, '%'); // and get the next specifier + } + + // Print out the last of the string + fputs(fmt, printf_fp); + return 1; +} + + +// +// doPrintfDisplay +// +// This runs through the blocks of CUPRINTF_MAX_LEN-sized data, calling the +// print function above to display them. We've got this separate from +// cudaPrintfDisplay() below so we can handle the SM_10 architecture +// partitioning. +// +static int doPrintfDisplay(int headings, int clear, char *bufstart, char *bufend, char *bufptr, char *endptr) +{ + // Grab, piece-by-piece, each output element until we catch + // up with the circular buffer end pointer + int printf_count=0; + char printfbuf_local[CUPRINTF_MAX_LEN+1]; + printfbuf_local[CUPRINTF_MAX_LEN] = '\0'; + + while(bufptr != endptr) + { + // Wrap ourselves at the end-of-buffer + if(bufptr == bufend) + bufptr = bufstart; + + // Adjust our start pointer to within the circular buffer and copy a block. + cudaMemcpy(printfbuf_local, bufptr, CUPRINTF_MAX_LEN, cudaMemcpyDeviceToHost); + + // If the magic number isn't valid, then this write hasn't gone through + // yet and we'll wait until it does (or we're past the end for non-async printfs). + cuPrintfHeader *hdr = (cuPrintfHeader *)printfbuf_local; + if((hdr->magic != CUPRINTF_SM11_MAGIC) || (hdr->fmtoffset >= CUPRINTF_MAX_LEN)) + { + //fprintf(printf_fp, "Bad magic number in printf header\n"); + break; + } + + // Extract all the info and get this printf done + if(headings) + fprintf(printf_fp, "[%d, %d]: ", hdr->blockid, hdr->threadid); + if(hdr->fmtoffset == 0) + fprintf(printf_fp, "printf buffer overflow\n"); + else if(!outputPrintfData(printfbuf_local+hdr->fmtoffset, printfbuf_local+sizeof(cuPrintfHeader))) + break; + printf_count++; + + // Clear if asked + if(clear) + cudaMemset(bufptr, 0, CUPRINTF_MAX_LEN); + + // Now advance our start location, because we're done, and keep copying + bufptr += CUPRINTF_MAX_LEN; + } + + return printf_count; +} + + +// +// cudaPrintfInit +// +// Takes a buffer length to allocate, creates the memory on the device and +// returns a pointer to it for when a kernel is called. It's up to the caller +// to free it. +// +extern "C" cudaError_t cudaPrintfInit(size_t bufferLen) +{ + // Fix up bufferlen to be a multiple of CUPRINTF_MAX_LEN + bufferLen = (bufferLen < CUPRINTF_MAX_LEN) ? CUPRINTF_MAX_LEN : bufferLen; + if((bufferLen % CUPRINTF_MAX_LEN) > 0) + bufferLen += (CUPRINTF_MAX_LEN - (bufferLen % CUPRINTF_MAX_LEN)); + printfbuf_len = (int)bufferLen; + + // Allocate a print buffer on the device and zero it + if(cudaMalloc((void **)&printfbuf_device, printfbuf_len) != cudaSuccess) + return cudaErrorInitializationError; + cudaMemset(printfbuf_device, 0, printfbuf_len); + printfbuf_start = printfbuf_device; // Where we start reading from + + // No restrictions to begin with + cuPrintfRestriction restrict; + restrict.threadid = restrict.blockid = CUPRINTF_UNRESTRICTED; + cudaMemcpyToSymbol(restrictRules, &restrict, sizeof(restrict)); + + // Initialise the buffer and the respective lengths/pointers. + cudaMemcpyToSymbol(globalPrintfBuffer, &printfbuf_device, sizeof(char *)); + cudaMemcpyToSymbol(printfBufferPtr, &printfbuf_device, sizeof(char *)); + cudaMemcpyToSymbol(printfBufferLength, &printfbuf_len, sizeof(printfbuf_len)); + + return cudaSuccess; +} + + +// +// cudaPrintfEnd +// +// Frees up the memory which we allocated +// +extern "C" void cudaPrintfEnd() +{ + if(!printfbuf_start || !printfbuf_device) + return; + + cudaFree(printfbuf_device); + printfbuf_start = printfbuf_device = NULL; +} + + +// +// cudaPrintfDisplay +// +// Each call to this function dumps the entire current contents +// of the printf buffer to the pre-specified FILE pointer. The +// circular "start" pointer is advanced so that subsequent calls +// dumps only new stuff. +// +// In the case of async memory access (via streams), call this +// repeatedly to keep trying to empty the buffer. If it's a sync +// access, then the whole buffer should empty in one go. +// +// Arguments: +// outputFP - File descriptor to output to (NULL => stdout) +// showThreadID - If true, prints [block,thread] before each line +// +extern "C" cudaError_t cudaPrintfDisplay(void *outputFP, bool showThreadID) +{ + printf_fp = (FILE *)((outputFP == NULL) ? stdout : outputFP); + + // For now, we force "synchronous" mode which means we're not concurrent + // with kernel execution. This also means we don't need clearOnPrint. + // If you're patching it for async operation, here's where you want it. + bool sync_printfs = true; + bool clearOnPrint = false; + + // Initialisation check + if(!printfbuf_start || !printfbuf_device || !printf_fp) + return cudaErrorMissingConfiguration; + + // To determine which architecture we're using, we read the + // first short from the buffer - it'll be the magic number + // relating to the version. + unsigned short magic; + cudaMemcpy(&magic, printfbuf_device, sizeof(unsigned short), cudaMemcpyDeviceToHost); + + // For SM_10 architecture, we've split our buffer into one-per-thread. + // That means we must do each thread block separately. It'll require + // extra reading. We also, for now, don't support async printfs because + // that requires tracking one start pointer per thread. + if(magic == CUPRINTF_SM10_MAGIC) + { + sync_printfs = true; + clearOnPrint = false; + int blocklen = 0; + char *blockptr = printfbuf_device; + while(blockptr < (printfbuf_device + printfbuf_len)) + { + cuPrintfHeaderSM10 hdr; + cudaMemcpy(&hdr, blockptr, sizeof(hdr), cudaMemcpyDeviceToHost); + + // We get our block-size-step from the very first header + if(hdr.thread_buf_len != 0) + blocklen = hdr.thread_buf_len; + + // No magic number means no printfs from this thread + if(hdr.magic != CUPRINTF_SM10_MAGIC) + { + if(blocklen == 0) + { + fprintf(printf_fp, "No printf headers found at all!\n"); + break; // No valid headers! + } + blockptr += blocklen; + continue; + } + + // "offset" is non-zero then we can print the block contents + if(hdr.offset > 0) + { + // For synchronous printfs, we must print from endptr->bufend, then from start->end + if(sync_printfs) + doPrintfDisplay(showThreadID, clearOnPrint, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len, blockptr+hdr.offset+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len); + doPrintfDisplay(showThreadID, clearOnPrint, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.offset+CUPRINTF_MAX_LEN); + } + + // Move on to the next block and loop again + blockptr += hdr.thread_buf_len; + } + } + // For SM_11 and up, everything is a single buffer and it's simple + else if(magic == CUPRINTF_SM11_MAGIC) + { + // Grab the current "end of circular buffer" pointer. + char *printfbuf_end = NULL; + cudaMemcpyFromSymbol(&printfbuf_end, printfBufferPtr, sizeof(char *)); + + // Adjust our starting and ending pointers to within the block + char *bufptr = ((printfbuf_start - printfbuf_device) % printfbuf_len) + printfbuf_device; + char *endptr = ((printfbuf_end - printfbuf_device) % printfbuf_len) + printfbuf_device; + + // For synchronous (i.e. after-kernel-exit) printf display, we have to handle circular + // buffer wrap carefully because we could miss those past "end". + if(sync_printfs) + doPrintfDisplay(showThreadID, clearOnPrint, printfbuf_device, printfbuf_device+printfbuf_len, endptr, printfbuf_device+printfbuf_len); + doPrintfDisplay(showThreadID, clearOnPrint, printfbuf_device, printfbuf_device+printfbuf_len, bufptr, endptr); + + printfbuf_start = printfbuf_end; + } + else + ;//printf("Bad magic number in cuPrintf buffer header\n"); + + // If we were synchronous, then we must ensure that the memory is cleared on exit + // otherwise another kernel launch with a different grid size could conflict. + if(sync_printfs) + cudaMemset(printfbuf_device, 0, printfbuf_len); + + return cudaSuccess; +} + +// Cleanup +#undef CUPRINTF_MAX_LEN +#undef CUPRINTF_ALIGN_SIZE +#undef CUPRINTF_SM10_MAGIC +#undef CUPRINTF_SM11_MAGIC + +#endif |