#include <stdio.h>
#include <graphic_driver.h>
#include <OS.h>
#include <KernelExport.h>
#include <malloc.h>

#include "nv_globals.h"
#include "nv_3da.h"
#include "riva_glh.h"
#include "riva_symbols.h"
#include <dirent.h>
#include <string.h>

/* Forward declarations of the file-local setup/teardown helpers; both are
 * defined further down in this file but are used by init_3da()/uninit_3da(). */
static status_t init_common(int the_fd);
static void uninit_common(void);

/*
 * Scan directory 'apath' for entries whose name starts with "10de" (treated
 * as nVidia kerneldriver nodes) and open the first one that can be opened
 * with B_READ_WRITE.
 *
 * apath: directory to scan.
 *
 * Returns the open file descriptor (>= 0) on success. Returns B_ERROR when
 * the directory cannot be opened; otherwise returns the result of the last
 * failed open(), or -1 when no candidate entry was tried.
 */
static int pick_device(const char *apath)
{
	DIR				*d;
	struct dirent	*e;
	char name_buf[1024];
	int fd = -1;

	/* open directory apath */
	d = opendir(apath);
	if (!d) return B_ERROR;

	/* walk the list of devices */
	while ((e = readdir(d)) != NULL)
	{
		int len;

		/* only accept nVidia kerneldriver */
		if (strncmp(e->d_name, "10de", 4)) continue;

		/* build "<apath>/<name>" without ever writing past name_buf; an entry
		 * whose full path does not fit is skipped instead of being truncated */
		len = snprintf(name_buf, sizeof(name_buf), "%s/%s", apath, e->d_name);
		if (len < 0 || (size_t)len >= sizeof(name_buf)) continue;

		fprintf(stderr, "Init3DA: trying kerneldriver %s\n", name_buf);
		fd = open(name_buf, B_READ_WRITE);
		if (fd >= 0) break;
	}
	closedir(d);

	return fd;
}

/*
 * Pick the lowest bit that is not yet set in si->engine.threeD.clones and
 * store it (as a single-bit mask) in the global clone_nr.
 *
 * Returns B_OK when a free bit was found. Returns B_ERROR when all 32 bits
 * are in use; clone_nr is left at 0 in that case.
 */
status_t get_clone_nr()
{
	uint32 candidate;
	uint8 bit;

	clone_nr = 0x00000000;

	for (bit = 0, candidate = 0x00000001; bit < 32; bit++, candidate <<= 1)
	{
		/* this bit is already claimed by another clone, try the next one */
		if (si->engine.threeD.clones & candidate) continue;

		clone_nr = candidate;
		return B_OK;
	}

	return B_ERROR;
}

/*
 * Open the nVidia kerneldriver, set up the shared info and register mappings
 * via init_common(), and register this instance as a 3D clone accelerant.
 *
 * Returns B_OK on success. On failure returns the negative value delivered by
 * pick_device(), the error from the signature ioctl() or init_common(), or
 * B_ERROR when no clone slot is free, the display mode is B_CMAP8, or the
 * card architecture is NV20A or newer. When a failure is detected after
 * init_common() succeeded, uninit_3da() is called to undo the setup.
 */
status_t init_3da(void)
{
	int the_fd;
	status_t result;
	char signature[1024];

	/* find a graphic device to open */
	the_fd = pick_device("/dev/graphics");
	/* test the descriptor we just received: the global fd is only assigned
	 * later on, inside init_common() */
	if (the_fd < 0)
	{
		fprintf(stderr, "Init3DA: Can't open kerneldriver!\n");
		return the_fd;
	}

	/* the returned signature itself is not used, only the call's result */
	result = ioctl(the_fd, B_GET_ACCELERANT_SIGNATURE, &signature, sizeof(signature));
	if (result != B_OK)
	{
		/* close the descriptor we opened; the global fd is not valid yet */
		close(the_fd);
		return result;
	}

	/* do the initialization common to both the primary and the clones */
	result = init_common(the_fd);

	/* bail out if the common initialization failed */
	if (result != B_OK)
	{
		close(the_fd);
		return result;
	}

	/* register ourselves as a 3D clone accelerant */
	AQUIRE_BEN(si->engine.lock)
	if (get_clone_nr() == B_OK)
	{
		si->engine.threeD.clones |= clone_nr;
		LOG(2,("Init3DA: Inited OK, 3D clone number is $%08x!\n", clone_nr));
	}
	else
	{
		LOG(2,("Init3DA: No more room for a 3D clone, closing down!\n"));
		result = B_ERROR;
	}
	RELEASE_BEN(si->engine.lock)

	if (si->dm.space == B_CMAP8)
	{
		LOG(2,("Init3DA: 8-bit colormode not supported, closing down!\n"));
		result = B_ERROR;
	}

	if (si->ps.card_arch >= NV20A)
	{
		LOG(2,("Init3DA: NV20 and higher architectures not supported, closing down!\n"));
		result = B_ERROR;
	}

	/* undo everything (clone bit, area clones, descriptor) on any failure */
	if (result != B_OK) uninit_3da();

	return result;
}

/*
 * Shut this 3D clone accelerant down: clear our bit in the shared clone mask,
 * drop the area clones via uninit_common() and close the driver descriptor.
 *
 * Always returns B_OK.
 */
status_t uninit_3da(void)
{
	LOG(2,("Uninit3DA: Shutting down 3Daccelerant.\n"));

	/* remove ourselves from the set of active 3D clone accelerants; the mask
	 * is only modified while holding the engine lock */
	AQUIRE_BEN(si->engine.lock)
	si->engine.threeD.clones = si->engine.threeD.clones & ~clone_nr;
	RELEASE_BEN(si->engine.lock)

	/* si is cleared by uninit_common(), so nothing below may use LOG */
	uninit_common();
	close(fd);

	return B_OK;
}

/*
 * Initialization code shared between primary and cloned accelerants.
 *
 * Stores the_fd in the global fd, asks the kerneldriver for its private data,
 * clones the shared info area (making si valid) and then sets up regs: either
 * from si->clone_bugfix_regs or by cloning the register area.
 *
 * Returns B_OK on success, otherwise the failing ioctl()/clone_area() result.
 * If cloning the register area fails, the shared info clone is deleted again.
 */
static status_t init_common(int the_fd) {
	nv_get_private_data gpd;
	status_t result;

	// LOG must not be used until si has been set up below

	/* memorize the file descriptor */
	fd = the_fd;

	/* contact the driver (the magic number shows we're for real) and get
	 * a pointer to the registers and shared data */
	gpd.magic = NV_PRIVATE_DATA_MAGIC;
	result = ioctl(fd, NV_GET_PRIVATE_DATA, &gpd, sizeof(gpd));
	if (result != B_OK)
		return result;

	/* clone the shared area for our use */
	shared_info_area = clone_area(DRIVER_PREFIX " shared", (void **)&si, B_ANY_ADDRESS,
		B_READ_AREA | B_WRITE_AREA, gpd.shared_info_area);
	if (shared_info_area < 0) {
		result = shared_info_area;
		return result;
	}

	// si is valid from here on, so LOG may be used
	LOG(4,("init_common: logmask 0x%08x, memory %dMB, hardcursor %d, usebios %d, switchhead %d, force_pci %d\n",
		si->settings.logmask, si->settings.memory, si->settings.hardcursor, si->settings.usebios, si->settings.switchhead, si->settings.force_pci));
	LOG(4,("init_common: dumprom %d, unhide_fw %d, pgm_panel %d, dma_acc %d\n",
		si->settings.dumprom, si->settings.unhide_fw, si->settings.pgm_panel, si->settings.dma_acc));

	if (si->use_clone_bugfix)
	{
		/* R4.5.0 workaround: use the register pointer the driver provides
		 * instead of cloning the register area ourselves */
		LOG(2,("Init3DA: Found R4.5.0 bug - attempting to work around\n"));
		regs = si->clone_bugfix_regs;
		return result;
	}

	/* clone the memory mapped registers for our use - does not work on
	 * <4.5.2 (but is better this way) */
	regs_area = clone_area(DRIVER_PREFIX " regs", (void **)&regs, B_ANY_ADDRESS,
		B_READ_AREA | B_WRITE_AREA, si->regs_area);
	if (regs_area < 0) {
		/* undo the shared info clone before reporting the failure */
		result = regs_area;
		delete_area(shared_info_area);
	}

	return result;
}

/*
 * Clean up code shared between primary and cloned accelerants: deletes the
 * register and shared info area clones and clears the pointers into them.
 */
static void uninit_common(void) {
	/* give back both mappings: first the memory mapped registers, then our
	 * copy of the shared info from the kernel driver */
	delete_area(regs_area);
	delete_area(shared_info_area);

	/* a little cheap paranoia: leave no stale pointers behind */
	regs = 0;
	si = 0;
}

/* Return the frame buffer pointer stored in the shared info (si->fbc). */
void *get_framebuffer(void)
{
	void *frame_buffer = si->fbc.frame_buffer;

//	LOG(2,("get_framebuffer called, returning $%08lx\n", (uint32)(si->fbc.frame_buffer)));
	return frame_buffer;
}

/*
 * Copy the display mode held in the shared info (si->dm) into the caller's
 * buffer.
 *
 * dm: destination for the mode; the call does nothing when it is NULL.
 */
void get_dm(display_mode *dm)
{
	if (dm == NULL) return;

	/* copy through the pointer: assigning to 'dm' itself would only change
	 * our local copy of the argument and the caller would receive nothing */
	*dm = si->dm;
}

/*
 * Wait until the engine is completely idle.
 *
 * First polls until the DMA get pointer has caught up with engine.dma.put,
 * giving up after 10000 polls or once 'err' has reached 3; then polls the
 * ACC status register until it reads zero. Always returns B_OK.
 */
status_t nv_acc_wait_idle_dma()
{
	/* we'd better check for timeouts on the DMA engine as it's theoretically
	 * breakable by malfunctioning software */
	uint16 polls;

	/* wait until all upcoming commands are in execution at least. Do this until
	 * we hit a timeout; abort if we failed at least three times before:
	 * if DMA stalls, we have to forget about it altogether at some point, or
	 * the system will almost come to a complete halt.. */
	/* note:
	 * it doesn't matter which FIFO channel's DMA registers we access, they are in
	 * fact all the same set. It also doesn't matter if the channel was assigned a
	 * command or not. */
	for (polls = 0;
		(NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET) != (si->engine.dma.put << 2)) &&
			(polls < 10000) && (err < 3);
		polls++)
	{
		/* snooze a bit so we do not hammer the bus */
		snooze(100);
	}

	/* log timeout if we had one */
	if (polls == 10000)
	{
		if (err < 3) err++;
		LOG(4,("ACC_DMA: wait_idle; DMA timeout #%d, engine trouble!\n", err));
	}

	/* wait until execution completed; note that this loop has no timeout */
	for (;;)
	{
		if (!(ACCR(STATUS))) break;

		/* snooze a bit so we do not hammer the bus */
		snooze(100);
	}

	return B_OK;
}

/*
 * Hand all commands queued so far in the DMA command buffer to the engine:
 * sets engine.dma.put to engine.dma.current and writes the new put position
 * to the DMAPUT register. Does nothing when both are already equal.
 */
void nv_start_dma(void)
{
	/* only receives the result of the register read below; never used */
	uint32 dummy;

	if (si->engine.dma.current != si->engine.dma.put)
	{
		si->engine.dma.put = si->engine.dma.current;
		/* flush used caches so we know for sure the DMA cmd buffer received all data. */
		/* some CPUs support out-of-order processing (WinChip/Cyrix). Flush them. */
		__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
		/* read a non-cached address to flush the cache */
		dummy = ACCR(STATUS);

		/* actually start DMA to execute all commands now in buffer */
		/* note:
		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
		 * fact all the same set. It also doesn't matter if the channel was assigned a
		 * command or not. */
		/* note also:
		 * NV_GENERAL_DMAPUT is a write-only register on some cards (confirmed NV11). */
		NV_REG32(NVACC_FIFO + NV_GENERAL_DMAPUT) = (si->engine.dma.put << 2);
	}
}

/*
 * Wait until the DMA command buffer has room for cmd_size more words.
 *
 * This routine does not check the engine's internal hardware FIFO, but the DMA
 * command buffer. You can see this as a FIFO as well, that feeds the hardware FIFO.
 * The hardware FIFO state is checked by the DMA hardware automatically.
 *
 * cmd_size: number of words needed (compared against engine.dma.free).
 *
 * Returns B_ERROR once 'err' has reached 3, B_OK otherwise.
 * NOTE(review): after a timeout that leaves 'err' below 3 this still returns
 * B_OK although engine.dma.free may be smaller than cmd_size - confirm that
 * callers can cope with that.
 */
status_t nv_acc_fifofree_dma(uint16 cmd_size)
{
	/* engine's current fetch position in the buffer (register value >> 2) */
	uint32 dmaget;

	/* we'd better check for timeouts on the DMA engine as it's theoretically
	 * breakable by malfunctioning software */
	uint16 cnt = 0;

	/* check if the DMA buffer has enough room for the command.
	 * note:
	 * engine.dma.free is 'cached' */
	while ((si->engine.dma.free < cmd_size) && (cnt < 10000) && (err < 3))
	{
		/* see where the engine is currently fetching from the buffer */
		/* note:
		 * read this only once in the code as accessing registers is relatively slow */
		/* note also:
		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
		 * fact all the same set. It also doesn't matter if the channel was assigned a
		 * command or not. */
		dmaget = ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET)) >> 2);

		/* update timeout counter: on NV11 on a Pentium4 2.8Ghz max reached count
		 * using BeRoMeter 1.2.6 was about 600; so counting 10000 before generating
		 * a timeout should definitely do it. Snooze()-ing cannot be done without a
		 * serious speed penalty, even if done for only 1 microsecond. */
		cnt++;

		/* where's the engine fetching viewed from us issuing? */
		if (si->engine.dma.put >= dmaget)
		{
			/* engine is fetching 'behind us', the last piece of the buffer is free */

			/* note the 'updated' free space we have in the DMA buffer */
			si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
			/* if it's enough after all we exit this routine immediately. Else: */
			if (si->engine.dma.free < cmd_size)
			{
				/* not enough room left, so instruct DMA engine to reset the buffer
				 * when it's reaching the end of it */
				((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x20000000;
				/* reset our buffer pointer, so new commands will be placed at the
				 * beginning of the buffer. */
				si->engine.dma.current = 0;
				/* tell the engine to fetch the remaining command(s) in the DMA buffer
				 * that were not executed before. */
				nv_start_dma();

				/* NOW the engine is fetching 'in front of us', so the first piece
				 * of the buffer is free */

				/* note the updated current free space we have in the DMA buffer */
				si->engine.dma.free = dmaget - si->engine.dma.current;
				/* mind this pitfall:
				 * Leave some room between where the engine is fetching and where we
				 * put new commands. Otherwise the engine will crash on heavy loads.
				 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
				 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
				 * Note:
				 * The engine is DMA triggered for fetching chunks every 128 bytes,
				 * maybe this is the reason for this behaviour.
				 * Note also:
				 * it looks like the space that needs to be kept free is coupled
				 * with the size of the DMA buffer. */
				if (si->engine.dma.free < 256)
					si->engine.dma.free = 0;
				else
					si->engine.dma.free -= 256;
			}
		}
		else
		{
			/* engine is fetching 'in front of us', so the first piece of the buffer
			 * is free */

			/* note the updated current free space we have in the DMA buffer */
			si->engine.dma.free = dmaget - si->engine.dma.current;
			/* mind this pitfall:
			 * Leave some room between where the engine is fetching and where we
			 * put new commands. Otherwise the engine will crash on heavy loads.
			 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
			 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
			 * Note:
			 * The engine is DMA triggered for fetching chunks every 128 bytes,
			 * maybe this is the reason for this behaviour.
			 * Note also:
			 * it looks like the space that needs to be kept free is coupled
			 * with the size of the DMA buffer. */
			if (si->engine.dma.free < 256)
				si->engine.dma.free = 0;
			else
				si->engine.dma.free -= 256;

			/* tell the engine to fetch the remaining command(s) in the DMA buffer
			 * that were not executed before. */
			nv_start_dma();
		}
	}

	/* log timeout if we had one; 'err' counts timeouts and saturates at 3 */
	if (cnt == 10000)
	{
		if (err < 3) err++;
		LOG(4,("ACC_DMA: fifofree; DMA timeout #%d, engine trouble!\n", err));
	}

	/* we must make the acceleration routines abort or the driver will hang! */
	if (err >= 3) return B_ERROR;

	return B_OK;
}

/*
 * Append one command header word to the DMA command buffer and account for
 * the header plus its 'size' argument words in engine.dma.free.
 *
 * cmd:    index into engine.fifo.ch_ptr[] selecting the command's FIFO channel.
 * offset: added to the channel pointer to form the FIFO offset.
 * size:   number of argument words that will follow the header.
 *
 * Header layout as used here: 'size' is shifted to bit 18 and the FIFO offset
 * is masked with 0x0000fffc; the top bits stay zero. According to the
 * original author's notes the top three bits (b29 - 31) hold the DMA opcode
 * (0 = 'method', 1 = 'jump', 2 = 'noninc method', 3 = 'call'), and a 'NOP' is
 * the opcode word $00000000.
 *
 * Note: the FIFO offset addresses auto-increment, so a new address has to be
 * set whenever a gap exists between the previous one and the new one.
 */
void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size)
{
	uint32 *cmd_buf = (uint32*)(si->dma_buffer);
	uint32 fifo_addr = ((si->engine.fifo.ch_ptr[cmd] + offset) & 0x0000fffc);

	cmd_buf[si->engine.dma.current++] = ((size << 18) | fifo_addr);

	/* the header word AND its arguments are no longer available as free space */
	si->engine.dma.free -= (size + 1);
}

/*
 * Queue a FIFO channel assignment in the DMA command buffer: a header word
 * with size 1 for 'ch', followed by 'handle' with bit 31 set. Reduces
 * engine.dma.free by the two words written.
 *
 * ch:     FIFO channel value placed in the header word.
 * handle: object handle to assign to that channel.
 */
void nv_acc_set_ch_dma(uint16 ch, uint32 handle)
{
	uint32 *cmd_buf = (uint32*)(si->dma_buffer);

	/* issue FIFO channel assign cmd, then the new assignment itself */
	cmd_buf[si->engine.dma.current++] = ((1 << 18) | ch);
	cmd_buf[si->engine.dma.current++] = (0x80000000 | handle);

	/* both words are gone from the free space */
	si->engine.dma.free -= 2;
}

//fixme: nolonger relying on switching FIFO assignment for NV4_CONTEXT_SURFACES_ARGB_ZS
//as doing that causes trouble when overlay is concurrently active!!!!
//we can forego switching for now as we had FIFO CH6 still unused...
//(note: switching has no noticable slowdown: measured 0.2% with Quake2)
void nv_acc_assert_fifo_dma(void)
{
	/* does every engine cmd this accelerant needs have a FIFO channel? */
	//fixme: can probably be optimized for both speed and channel selection...
	if (!si->engine.fifo.ch_ptr[NV_ROP5_SOLID] ||
		!si->engine.fifo.ch_ptr[NV_IMAGE_BLACK_RECTANGLE] ||
		!si->engine.fifo.ch_ptr[NV4_SURFACE] ||
		!si->engine.fifo.ch_ptr[NV_IMAGE_BLIT] ||
		!si->engine.fifo.ch_ptr[NV4_CONTEXT_SURFACES_ARGB_ZS] ||
		!si->engine.fifo.ch_ptr[NV4_DX5_TEXTURE_TRIANGLE])
	{
		LOG(2,("assert_fifo failed..\n"));

		uint16 cnt;

		/* free the FIFO channels we want from the currently assigned cmd's */
		si->engine.fifo.ch_ptr[si->engine.fifo.handle[0]] = 0;
		si->engine.fifo.ch_ptr[si->engine.fifo.handle[1]] = 0;
		si->engine.fifo.ch_ptr[si->engine.fifo.handle[3]] = 0;
		si->engine.fifo.ch_ptr[si->engine.fifo.handle[4]] = 0;
		si->engine.fifo.ch_ptr[si->engine.fifo.handle[6]] = 0;
		si->engine.fifo.ch_ptr[si->engine.fifo.handle[7]] = 0;

		/* set new object handles */
		si->engine.fifo.handle[0] = NV_ROP5_SOLID;
		si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
		si->engine.fifo.handle[3] = NV4_SURFACE;
		si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
		si->engine.fifo.handle[6] = NV4_CONTEXT_SURFACES_ARGB_ZS;
		si->engine.fifo.handle[7] = NV4_DX5_TEXTURE_TRIANGLE;

		/* set handle's pointers to their assigned FIFO channels */
		/* note:
		 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
		for (cnt = 0; cnt < 0x08; cnt++)
		{
			si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
				(0x00000001 + (cnt * 0x00002000));
		}

		/* wait for room in fifo for new FIFO assigment cmds if needed. */
		if (nv_acc_fifofree_dma(12) != B_OK) return;

		/* program new FIFO assignments */
		/* Raster OPeration: */
		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
		/* Clip: */
		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
		/* 2D surfaces: */
		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
		/* retrace sync and blit: */
		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
		/* surface */
		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);
		/* Textured Triangle */
		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH7, si->engine.fifo.handle[7]);
	}
}
