; Chunky2Planar algorithm, originally by James McCoull
; Modified by Peter McGavin for variable size and depth
; and "dirty list" (hope I didn't slow it down too much)
;
; 	Cpu only solution
;	Optimised for 020+fastram
;	Aim for less than 90ms for 320x200x256 on 14MHz 020

;void __asm c2p_8_020 (register __a0 UBYTE *chunky_data,
;                      register __a1 PLANEPTR raster,
;                      register __a2 UBYTE *dirty_list,
;                      register __d1 ULONG plsiz,
;                      register __a5 UBYTE *tmp_buffer);
; (assembled as _c2p_8_020, _c2p_6_020 or _c2p_4_020 according to "depth")

; a0 -> width*height chunky pixels in fastmem
; a1 -> contiguous bitplanes in chipmem
; a2 -> dirty list (1-byte flag for whether each 32 pixel "unit" needs updating)
; d1 = width*height/8   (width*height must be a multiple of 32)
; a5 -> width*height tmp buffer in fastmem

	; Export exactly one entry label, chosen by the assembly-time symbol depth.
	; depth must be 8, 6 or 4; any other value stops the assembly via fail.
	; The label is placed immediately before the code that follows in this file.
	ifeq	depth-8
		xdef	_c2p_8_020
_c2p_8_020:
	else
	ifeq	depth-6
		xdef	_c2p_6_020
_c2p_6_020:
	else
	ifeq	depth-4
		xdef	_c2p_4_020
_c2p_4_020:
	else
		fail	"unsupported depth!"
	endc
	endc
	endc


; wordmerge i1,i2,tmp
; Exchange the low word of i1 with the high word of i2.
; With i1 = A:B and i2 = C:D (high:low words) on entry, the macro leaves
; i1 = A:C and i2 = B:D.  tmp is overwritten with the original i2.
wordmerge	macro		; i1 i2 tmp
				; \1 \2 \3
		move.l	\2,\3		;\3 = CD  (keep a copy of the original i2)
		move.w	\1,\2		;\2 = CB
		swap	\2		;\2 = BC
		move.w	\2,\1		;\1 = AC
		move.w	\3,\2		;\2 = BD  (low word D comes back from the copy)
		endm


; merge io,in,out,tmp,msk,sft
; Interleave the bit groups of two longwords.
; The mask selects the low group of each pair, sft is the group width in bits.
; On exit:
;   io  = (io AND NOT msk) OR ((in AND NOT msk) >> sft)
;   out = ((io AND msk) << sft) OR (in AND msk)
;   in  = (in AND NOT msk) >> sft   (clobbered)
;   tmp = in AND msk                (clobbered)
; The mask is first copied into out and tmp; every call site in this file
; passes the mask in an address register.
merge		macro		; io  in out tmp msk sft
				; \1  \2  \3  \4  \5  \6
				; \1 = abqr
				; \2 = ijyz
		move.l	\5,\3	; \3 = 0x0x
		move.l	\3,\4	; \4 = 0x0x
		and.l	\1,\3	; \3 = 0b0r
		and.l	\2,\4	; \4 = 0j0z
		eor.l	\3,\1	; \1 = a0q0
		eor.l	\4,\2	; \2 = i0y0
		lsr.l	#\6,\2	; \2 = 0i0y
	ifeq \6-1
		add.l	\3,\3	; \3 = b0r0 (shift by one done as an add)
	else
		lsl.l	#\6,\3	; \3 = b0r0
	endc
		or.l	\2,\1	; \1 = aiqy
		or.l	\4,\3	; \3 = bjrz
		endm

; merge4 io,in,out,tmp,msk
; Nibble merge (shift of 4) whose results are stored through (a5)+.
; depth > 4 : same arithmetic as merge with a shift of 4; the high-nibble
;             result (io) is stored first, then the low-nibble result (out).
;             Two longwords are written per expansion; in and tmp are clobbered.
; otherwise : only the low-nibble result is built in out and stored, so one
;             longword is written per expansion; io is left unchanged, in is
;             reduced to (in AND msk) and tmp is not touched.
; Call sites in this file pass a sixth argument, which this macro does not use.
merge4		macro		; io  in out tmp msk
				; \1  \2  \3  \4  \5
				; \1 = abqr
				; \2 = ijyz
	ifgt depth-4
		move.l	\5,\3	; \3 = 0x0x
		move.l	\3,\4	; \4 = 0x0x
		and.l	\1,\3	; \3 = 0b0r
		and.l	\2,\4	; \4 = 0j0z
		eor.l	\3,\1	; \1 = a0q0
		eor.l	\4,\2	; \2 = i0y0
		lsr.l	#4,\2	; \2 = 0i0y
		or.l	\2,\1	; \1 = aiqy
		move.l	\1,(a5)+	; write to tmp buffer
		lsl.l	#4,\3	; \3 = b0r0
		or.l	\4,\3	; \3 = bjrz
		move.l	\3,(a5)+	; write to tmp buffer
	else
		move.l	\5,\3	; this version returns only 1 result
		and.l	\3,\2	; \2 = 0j0z
		and.l	\1,\3	; \3 = 0b0r
		lsl.l	#4,\3	; \3 = b0r0
		or.l	\2,\3	; \3 = bjrz
		move.l	\3,(a5)+	; write to tmp buffer
	endc
		endm

; merge1 io,in,out,tmp,msk,flg
; Final single-bit merge (shift of 1) that stores finished plane longwords.
; The first result (io) is always stored at (a2), after which a2 is moved
; back by the value held in a5 (the caller loads plsiz into a5).
; The second result is always built in out; it is stored at (a2), followed
; by another step back, only when flg is non-zero.  With flg = 0 the second
; result stays in out for the caller to store.
; in and tmp are clobbered.
merge1		macro		; io  in out tmp msk flg
				; \1  \2  \3  \4  \5  \6
				; \1 = abqr
				; \2 = ijyz
		move.l	\5,\3	; \3 = 0x0x
		move.l	\3,\4	; \4 = 0x0x
		and.l	\1,\3	; \3 = 0b0r
		and.l	\2,\4	; \4 = 0j0z
		eor.l	\3,\1	; \1 = a0q0
		eor.l	\4,\2	; \2 = i0y0
		lsr.l	#1,\2	; \2 = 0i0y
		or.l	\2,\1	; \1 = aiqy
		move.l	\1,(a2)		; write to output plane
		suba.l	a5,a2		; -plsiz
		add.l	\3,\3	; \3 = b0r0
		or.l	\4,\3	; \3 = bjrz
	ifne \6
		move.l	\3,(a2)		; write to output plane
		suba.l	a5,a2		; -plsiz
	endc
		endm


; One-time relocation stub.
;
; First call : start jumps to next, which slides the c2p routine down by
;              0..14 bytes into the padding reserved below, patches the jmp
;              operand to point at the relocated copy, flushes the caches
;              and re-enters through start.
; Later calls: start jumps straight to the relocated c2p.
;
; All argument registers (a0/a1/a2/a5/d1) reach c2p unchanged: d1/a0/a1/a6
; are saved here, a2/a5 are not touched by this stub.
;
; The jmp MUST be encoded as the 6-byte absolute-long form because the
; 32-bit operand at start+2 is overwritten below.  The explicit (..).l
; stops an optimising assembler from shortening it to a pc-relative or
; short-branch form, which would make the patch corrupt the next opcode.
start:		jmp	(next).l	; self-modified code here (operand at start+2)
next:		movem.l	d1/a0-a1/a6,-(sp)

; Relocate c2p so that firstsweep2 is at a quad-longword-aligned address.
; Firstsweep2 loop doesn't fit in '020/'030 cache unless it is exactly aligned.
; Speed penalty of misalignment is about 30%.

		lea	(firstsweep2,pc),a0
		move.l	a0,d0
		and.w	#%00001111,d0	; d0.w = firstsweep2 mod 16: relocate by -d0.w bytes

		lea	(c2p,pc),a0	; a0 = src
		movea.l	a0,a1
		sub.w	d0,a1		; a1 = dst (dst <= src, lands in the ds.w padding)

		move.l	a1,start+2	; patch jmp operand with the relocated entry

		move.w	#(end-c2p)/2-1,d0	; word count - 1 for dbra
loop:		move.w	(a0)+,(a1)+	; ascending copy; safe for overlap because dst <= src
		dbra	d0,loop

		move.l	(4).w,a6	; a6 = ExecBase
		jsr	(_LVOCacheClearU,a6)	; flush caches so the moved/patched code is fetched

		movem.l	(sp)+,d1/a0-a1/a6
		bra.b	start		; restart through the patched jmp

		ds.w	8		; 16 bytes of space for relocation of c2p routine

; the real c2p routine starts here
c2p:
; Two-sweep chunky to planar conversion driven by the dirty list.
; Sweep 1 reads each dirty 32-pixel unit from the chunky buffer, performs
; the word, byte and nibble merges and appends the result to the tmp buffer.
; Sweep 2 reads the tmp buffer back, performs the 2-bit and 1-bit merges
; and writes one longword per bitplane for each dirty unit.
; d2-d7/a2-a6 are saved on entry and restored on exit.
		movem.l	d2-d7/a2-a6,-(sp)

		sub.w	#24,sp		; space for temporary variables

; a0 = chunky buffer
; a1 = output area
; a2 = dirty list
; d1 = plsiz
; a5 = tmp buffer

; Temporary variables on the stack:
;   (0,sp)  scratch longword used to park one data register
;   (4,sp)  output address
;   (8,sp)  dirty list pointer
;   (12,sp) plsiz
;   (16,sp) 7*plsiz, 5*plsiz or 3*plsiz for depth 8, 6 or 4
;   (20,sp) tmp buffer start address

		move.l	a1,(4,sp)	; save output address
		move.l	a2,(8,sp)	; save dirty list ptr
		move.l	d1,(12,sp)	; save plsiz
		lsl.l	#3,d1		; d1 = 8*plsiz = size of chunky buffer in bytes
		movea.l	a0,a1
		adda.l	d1,a1		; a1 -> end of chunky buffer
		sub.l	(12,sp),d1	; d1 = 7*plsiz
	ifle depth-6
		sub.l	(12,sp),d1
		sub.l	(12,sp),d1	; d1 = 5*plsiz
	endc
	ifle depth-4
		sub.l	(12,sp),d1
		sub.l	(12,sp),d1	; d1 = 3*plsiz
	endc
		move.l	d1,(16,sp)	; save 7*plsiz (or 5*plsiz) (or 3*plsiz)
		move.l	a5,(20,sp)	; save tmp buffer address

;; Sweep thru the whole chunky data once,
;; Performing 3 merge operations on it.

		move.l	#$00ff00ff,a3	; load byte merge mask
		move.l	#$0f0f0f0f,a4	; load nibble merge mask

; pass 1
; Initial scan: runs until the first dirty unit is found.  If the end of the
; chunky buffer is reached first there is nothing to convert.
firstsweep:	tst.b	(a2)+		; does next 32 pixel unit need updating?
		bne.b	firstsweep3

		adda.w	#32,a0		; skip 32 pixels on input

		cmpa.l	a0,a1		; reached end of chunky buffer?
		bne.b	firstsweep
		bra.w	exit		; exit if no changes

; this becomes the main loop of the first sweep after the first change is found
firstsweep2:	tst.b	(a2)+		; does next 32 pixel unit need updating?
		bne.b	firstsweep3

		adda.w	#32,a0		; skip 32 pixels on input

		cmpa.l	a0,a1		; reached end of chunky buffer?
		bne.b	firstsweep2
		bra.w	secondsweep	; on to second sweep if changes

; Convert one dirty unit.  The tmp buffer pointer a5 only advances here, so
; the tmp buffer holds the dirty units packed back to back.
firstsweep3:
		movem.l	(a0)+,d0-d7	; get 32 pixels in registers
; d0-7 = abcd efgh ijkl mnop qrst uvwx yzAB CDEF

		wordmerge	d0,d4,a6	;d0/4 = abqr cdst
		wordmerge	d1,d5,a6	;d1/5 = efuv ghwx
		wordmerge	d2,d6,a6	;d2/6 = ijyz klAB
		wordmerge	d3,d7,a6 	;d3/7 = mnCD opEF

; temporarily save off some registers
; (d6 and d7 are needed as out/tmp registers by the merges below)
		movea.l	d7,a6
		move.l	d6,(sp)

; pass 2
		merge	d0,d2,d6,d7,a3,8	;d0/d6 = aiqy bjrz
		merge	d1,d3,d7,d2,a3,8	;d1/d7 = emuC fnvD

; pass 3
		merge4	d0,d1,d2,d3,a4,4	;d0/d2  = ae74... ae30...
		merge4	d6,d7,d3,d1,a4,4	;d6/d3  = bf74... bf30...

; bring them back
		move.l	a6,d7
		move.l	(sp),d6

; pass 2
		merge	d4,d6,d0,d1,a3,8	;d4/d0 = cksA dltB
		merge	d5,d7,d1,d6,a3,8	;d5/d1 = gowE hpxF

; pass 3
		merge4	d4,d5,d6,d7,a4,4	;d4/d6 = cg74.. cg30..
		merge4	d0,d1,d7,d5,a4,4	;d0/d7 = dh74.. dh30..

		cmpa.l	a0,a1		; reached end of chunky buffer?
		bne.w	firstsweep2		; end of firstsweep, 250 bytes
						; only just fits in instr cache

; tmp buffer now holds, for each dirty unit (depth > 4):
;	ae74.. ae30.. bf74.. bf30.. cg74.. cg30.. dh74.. dh30..
; For depth 4 merge4 stores only the four ..30.. longwords.

secondsweep:
		movea.l	a5,a1			; a1 -> end of tmp buffer
		movea.l	(4,sp),a2		; a2 -> plane0
		movea.l	(8,sp),a6		; a6 -> dirty list
		movea.l	(12,sp),a5		; a5 = plsiz
		adda.l	(16,sp),a2		; a2 -> highest plane (plane 7, 5 or 3)
		movea.l	(20,sp),a0		; a0 -> tmp buffer

		movea.l	#$33333333,a3		; 2-bit merge mask
		movea.l	#$55555555,a4		; 1-bit merge mask

		bra.b	secondsweep2

secondsweep3:	addq.l	#4,a2		; skip 32 pixels on output

secondsweep2:	tst.b	(a6)+		; does next 32 pixel unit need updating?
		beq.b	secondsweep3

	ifgt depth-4

		movem.l	(a0)+,d0-d6		; read tmp buffer, not d7 yet

; save d5 temporarily
		move.l	d5,(sp)

;; pass 4
		merge	d0,d4,d5,d7,a3,2	; d0/d5 = aceg76.. aceg54..
		merge	d2,d6,d7,d4,a3,2	; d2/d7 = bdfh76.. bdfh54..

;; pass 5
; each merge1 stores at (a2) and then steps a2 down by one plane
	ifgt depth-6
		merge1	d0,d2,d4,d6,a4,1	; d0/d4 = abcd7... abcd6...
	endc
		merge1	d5,d7,d6,d2,a4,1	; d5/d6 = abcd5... abcd4...


; restore d5 and finally get d7
		move.l	(sp),d5
		move.l	(a0)+,d7

	else

		movem.l	(a0)+,d1/d3/d5/d7	; read tmp buf, depth 4 version

	endc

;; pass 4
		merge	d1,d5,d4,d6,a3,2	; d1/d4 = aceg32.. aceg10..
		merge	d3,d7,d6,d5,a3,2	; d3/d6 = bdfh32.. bdfh10..

;; pass 5
		merge1	d1,d3,d5,d7,a4,1	; d1/d5 = abcd3... abcd2...
		merge1	d4,d6,d7,d3,a4,0	; d4/d7 = abcd1... abcd0...
						; flg = 0: plane 0 stays in d7

		move.l	d7,(a2)+		; plane 0
		adda.l	(16,sp),a2		; +7*plsiz (or 5*plsiz) (or 3*plsiz)
						; back to the highest plane, next longword

		cmp.l	a0,a1			; all tmp buffer data consumed?
		bne.w	secondsweep2		; end of secondsweep, 216 bytes

exit:
		add.w	#24,sp			; release temporary variables
		movem.l	(sp)+,d2-d7/a2-a6
		rts
end:
		end
