; Chunky2Planar algorithm, originally by James McCoull
; Modified by Peter McGavin for variable size and depth
; and "dirty list" (hope I didn't slow it down too much)
;
; 	Cpu only solution VERSION 2
;	Optimised for 040+fastram
;	bitplanes are assumed contiguous!
;	analyse instruction offsets to check performance

;void __asm c2p_8_040 (register __a0 UBYTE *chunky_data,
;                      register __a1 PLANEPTR raster,
;                      register __a2 UBYTE *dirty_list,
;                      register __d1 ULONG plsiz);

; a0 -> width*height chunky pixels
; a1 -> contiguous bitplanes
; a2 -> dirty list (1-byte flag for whether each 32 pixel "unit" needs updating;
;       the flags are probed four at a time with a longword read, i.e. 128 pixels)
; d1 = width*height/8   (width*height must be a multiple of 32)

	ifeq	depth-8			; 8 bitplanes: export _c2p_8_040
		xdef	_c2p_8_040
_c2p_8_040:
	endc

	ifeq	depth-6			; 6 bitplanes: export _c2p_6_040
		xdef	_c2p_6_040
_c2p_6_040:
	endc

	ifeq	depth-4			; 4 bitplanes: export _c2p_4_040
		xdef	_c2p_4_040
_c2p_4_040:
	endc

; The product is non-zero exactly when depth is none of 4, 6 or 8,
; in which case no entry label was emitted above and assembly is aborted.
	ifne	(depth-4)*(depth-6)*(depth-8)
		fail	"unsupported depth!"
	endc

; merge - exchange bit groups between two data registers.
;
;   \1 = in1  (data register, modified)
;   \2 = in2  (data register, modified)
;   \3 = tmp3 (data register, clobbered)
;   \4 = tmp4 (data register, clobbered)
;   \5 = mask  (32-bit constant selecting the bit groups that move "up")
;   \6 = shift (distance between the two bit groups; a shift of 1 is
;               assembled as add.l \3,\3 instead of lsl.l #1,\3)
;
; Result, in terms of the values on entry:
;   \1 = (in1 & ~mask) | ((in2 & ~mask) >> shift)
;   \2 = ((in1 & mask) << shift) | (in2 & mask)
;
; The letter patterns in the comments below (abqr, ijyz, ...) show which
; group of each input ends up where.
merge		macro ; in1,in2,tmp3,tmp4,mask,shift
; \1 = abqr
; \2 = ijyz
		move.l	\2,\4	; \4 = copy of in2
		move.l	#\5,\3	; \3 = mask
		and.l	\3,\2	; \2 = 0j0z
		and.l	\1,\3	; \3 = 0b0r
		eor.l	\3,\1	; \1 = a0q0
		eor.l	\2,\4	; \4 = i0y0
		ifeq	\6-1
		add.l	\3,\3	; shift == 1: doubling replaces the left shift
		else
		lsl.l	#\6,\3	; \3 = b0r0
		endc
		lsr.l	#\6,\4	; \4 = 0i0y
		or.l	\3,\2	; \2 = bjrz
		or.l	\4,\1	; \1 = aiqy
		endm

; merge4 - the shift-by-4 merge step, specialised on the assembly-time
; symbol "depth".
;
;   depth > 4 : expands to the full  merge \1,\2,\3,\4,\5,4
;   otherwise : only \2 is produced:
;                   \2 = ((in1 & mask) << 4) | (in2 & mask)
;               \1 and \4 are left untouched and \3 is clobbered.
;
; Only parameters \1..\5 are referenced; any further argument supplied
; by a caller is not used by the macro body.
merge4		macro ; in1,in2,tmp3,tmp4,mask
	ifgt depth-4
		merge	\1,\2,\3,\4,\5,4
	else
		move.l	#\5,\3	; \3 = mask
		and.l	\3,\2	; \2 = 0j0z
		and.l	\1,\3	; \3 = 0b0r
		lsl.l	#4,\3	; \3 = b0r0
		or.l	\3,\2	; \2 = bjrz
	endc
		endm


; "start" marks the position of a relocation stub that is currently
; disabled: every line between this label and "c2p" is commented out,
; so no instructions are assembled here and "start" and "c2p" refer to
; the same address.  The commented-out stub copied the routine to a
; 16-byte aligned address, patched the jmp at "start" to point at the
; copy and flushed the caches before restarting.
start:
;		jmp	next		; self-modified code here
;next:
;		movem.l	d1/a0-a2,-(sp)
;; relocate c2p to a 16-aligned address
;		lea	(c2p,pc),a0
;		move.l	a0,d0
;		and.b	#%11110000,d0
;		move.l	d0,a1
;
;; patch jmp
;		move.l	d0,start+2
;		move.w	#(end-c2p)-1,d0
;loop:		move.b	(a0)+,(a1)+
;		dbra	d0,loop
;
;; flush cache
;		move.l	(4).w,a6
;		jsr	(_LVOCacheClearU,a6)
;
;; restore parameters and restart
;		movem.l	(sp)+,d1/a0-a2
;		bra.b	start
;
;		ds.w	8		; space for relocation
; the real c2p routine starts here
c2p:
; Prologue.  On entry:
;   a0 = chunky buffer
;   a1 = output area
;   a2 = dirty list
;   d1 = plsiz
;
; Stack frame (44 bytes below the saved registers):
;    0..31  one longword per bitplane: converted unit awaiting output
;   32      output address belonging to that pending unit
;   36      scratch slot for the current output address
		movem.l	d2-d7/a2-a6,-(sp)	; preserve everything we touch
		lea	(-44,sp),sp		; allocate the frame

		movea.l	d1,a3			; a3 = plsiz (distance between planes)
		lsl.l	#3,d1			; d1 = plsiz*8 = number of chunky bytes
		lea	(0,a0,d1.l),a4		; a4 -> end of chunky data

; first_loop / first_patch - locate the first dirty 32-pixel unit.
;
; The dirty list is probed one longword (four flags = 128 pixels) at a
; time; clean groups are skipped by advancing a0 by 128 chunky bytes and
; a1 by 16 bytes.  When a group contains a set flag, first_patch steps
; through its four flags to find the first set one and enters first_case
; with a0/a1/a2 positioned on (respectively just past the flag of) it.
;
; Bounds handling: a0 is compared against a4 (end of chunky data) BEFORE
; every probe and before every per-unit test except the first of a
; group, so
;   - an empty buffer (plsiz = 0) exits without touching the dirty list,
;   - the scan stops exactly when a0 reaches a4 instead of probing one
;     more longword past the end of the list,
;   - a flag lying beyond the last unit is never acted upon, even when
;     the longword probe straddles the end of the list (the probe itself
;     may still read up to three bytes past the last flag).
first_loop:	cmpa.l	a0,a4		; flags = a4 - a0
		bls.w	exit		; a0 >= a4: nothing dirty, nothing to flush
		tst.l	(a2)+		; do the next 128 pixels need updating?
		bne.b	first_patch	; branch if yes

		adda.w	#128,a0		; skip 128 pixels on input
		adda.w	#16,a1		; skip 128 pixels on output
		bra.b	first_loop

first_patch:	subq.l	#4,a2		; restore input address
		tst.b	(a2)+		; do the next 32 pixels need updating?
		bne.b	first_case	; branch if yes
		adda.w	#32,a0		; skip 32 pixels on input
		addq.l	#4,a1		; skip 32 pixels on output
		cmpa.l	a0,a4		; ran off the end of the chunky data?
		bls.w	exit		; yes: the set flag was beyond the last unit
		tst.b	(a2)+		; do the next 32 pixels need updating?
		bne.b	first_case	; branch if yes
		adda.w	#32,a0		; skip 32 pixels on input
		addq.l	#4,a1		; skip 32 pixels on output
		cmpa.l	a0,a4
		bls.w	exit
		tst.b	(a2)+		; do the next 32 pixels need updating?
		bne.b	first_case	; branch if yes
		adda.w	#32,a0		; skip 32 pixels on input
		addq.l	#4,a1		; skip 32 pixels on output
		cmpa.l	a0,a4
		bls.w	exit
		tst.b	(a2)+		; do the next 32 pixels need updating?
		bne.b	first_case	; branch if yes
		adda.w	#32,a0		; skip 32 pixels on input
		addq.l	#4,a1		; skip 32 pixels on output
		bra.b	first_loop	; this should never happen

; first_case - convert the first dirty 32-pixel unit.
;
; Reads 32 chunky bytes at a0, runs them through the merge passes and
; stores one longword per bitplane in the stack frame at (n*4,sp).
; Nothing is written to the raster here: the output address is saved at
; (32,sp) and the stored longwords are written out later, from that
; saved address, by main_case or final_case.
;
; Each data register is loaded with a longword from the first 16 bytes
; of the unit and then has its low word replaced by a word from the
; second 16 bytes, e.g. d1 = bytes 0,1 (high word) and 16,17 (low word).
; d6/d7 double as the merge temporaries, so their loaded values are
; parked in a5/a6 and swapped back in with exg once d1/d0 are finished
; with the first two passes.
first_case:	move.l	(0,a0),d1
	 	move.l	(4,a0),d3
		move.l	(8,a0),d0
		move.l	(12,a0),d2
		move.l	(2,a0),d4
	 	move.l	(10,a0),d5
		move.l	(6,a0),d6
		move.l	(14,a0),d7

	 	move.w	(16,a0),d1	; low words come from bytes 16..31
	 	move.w	(24,a0),d0
		move.w	(20,a0),d3
		move.w	(28,a0),d2
	 	move.w	(18,a0),d4
	 	move.w	(26,a0),d5
		move.w	(22,a0),d6
		move.w	(30,a0),d7

		adda.w	#32,a0		; input now points past this unit

		move.l	d6,a5		; free d6/d7 for use as merge temporaries
		move.l	d7,a6

		merge	d1,d0,d6,d7,$00ff00ff,8
		merge	d3,d2,d6,d7,$00ff00ff,8

		merge4	d1,d3,d6,d7,$0f0f0f0f,4	
		merge4	d0,d2,d6,d7,$0f0f0f0f,4

		exg	d1,a5		; park d1/d0, fetch the values loaded into d6/d7
		exg	d0,a6

		merge	d4,d5,d6,d7,$00ff00ff,8
		merge	d1,d0,d6,d7,$00ff00ff,8

		merge4	d4,d1,d6,d7,$0f0f0f0f,4
		merge4	d5,d0,d6,d7,$0f0f0f0f,4

		merge	d3,d1,d6,d7,$33333333,2
		merge	d2,d0,d6,d7,$33333333,2	

		merge	d3,d2,d6,d7,$55555555,1
		merge	d1,d0,d6,d7,$55555555,1

		move.l	d0,(0*4,sp)		;plane0 (movem.l is slower!)
		move.l	d1,(1*4,sp)		;plane1
		move.l	d2,(2*4,sp)		;plane2
		move.l	d3,(3*4,sp)		;plane3

; planes 4 and up are only assembled for depth > 4
	ifgt depth-4

		move.l	a5,d3		; recover the values parked by the exg above
		move.l	a6,d2

		merge	d3,d4,d6,d7,$33333333,2
		merge	d2,d5,d6,d7,$33333333,2

	ifgt depth-6
		merge	d3,d2,d6,d7,$55555555,1
	endc
		merge	d4,d5,d6,d7,$55555555,1

		move.l	d5,(4*4,sp)		;plane4
		move.l	d4,(5*4,sp)		;plane5

	ifgt depth-6
		move.l	d2,(6*4,sp)		;plane6
		move.l	d3,(7*4,sp)		;plane7
	endc

	endc

		move.l	a1,(32,sp)		; save output address
		addq.l	#4,a1			; skip 32 pixels on output

		cmpa.l	a0,a4			; was that the last unit of the buffer?
		beq.w	final_case		; yes: just write it out


; main_loop / main_patch - process every remaining dirty unit.
;
; One converted unit is always pending in the stack frame when this
; code runs.  The dirty list is probed one longword (four flags = 128
; pixels) at a time; clean groups are skipped, and for a group with a
; set flag main_patch walks the four flags and calls main_case for each
; set one.  When the chunky data is exhausted control passes to
; final_case, which writes out the pending unit.
;
; Bounds handling: a0 is compared against a4 (end of chunky data) BEFORE
; every probe and before the flag test of units 2..4 of a group.  The
; scan position here is generally not a multiple of 128 pixels from the
; start of the buffer, so the last probe can straddle the end of the
; dirty list; the checks guarantee that
;   - no probe is made once a0 has reached a4,
;   - a flag lying beyond the last unit never triggers main_case, so no
;     chunky data past a4 is converted or written out
;     (the longword probe itself may still read up to three bytes past
;     the last flag).
main_loop:	cmpa.l	a0,a4		; flags = a4 - a0
		bls.w	final_case	; a0 >= a4: input exhausted, flush pending unit
		tst.l	(a2)+		; do the next 128 pixels need updating?
		bne.b	main_patch	; branch if yes

		adda.w	#128,a0		; skip 128 pixels on input
		adda.w	#16,a1		; skip 128 pixels on output
		bra.b	main_loop

main_patch:	subq.l	#4,a2		; restore input address
		tst.b	(a2)+		; do the next 32 pixels need updating?
		beq.b	1$		; branch if no
		bsr.b	main_case
1$:		adda.w	#32,a0		; skip 32 pixels on input
		addq.l	#4,a1		; skip 32 pixels on output
		cmpa.l	a0,a4		; ran off the end of the chunky data?
		bls.w	final_case	; yes: remaining flags are not ours
		tst.b	(a2)+		; do the next 32 pixels need updating?
		beq.b	2$		; branch if no
		bsr.b	main_case
2$:		adda.w	#32,a0		; skip 32 pixels on input
		addq.l	#4,a1		; skip 32 pixels on output
		cmpa.l	a0,a4
		bls.w	final_case
		tst.b	(a2)+		; do the next 32 pixels need updating?
		beq.b	3$		; branch if no
		bsr.b	main_case
3$:		adda.w	#32,a0		; skip 32 pixels on input
		addq.l	#4,a1		; skip 32 pixels on output
		cmpa.l	a0,a4
		bls.w	final_case
		tst.b	(a2)+		; do the next 32 pixels need updating?
		beq.b	4$		; branch if no
		bsr.b	main_case
4$:		adda.w	#32,a0		; skip 32 pixels on input
		addq.l	#4,a1		; skip 32 pixels on output
		bra.b	main_loop	; bounds are re-checked at the loop top

; main_case - subroutine, entered with bsr from main_patch.
;
; Converts the 32-pixel unit at a0 while writing out the PREVIOUSLY
; converted unit, whose plane longwords and output address are held in
; the stack frame.  The plane writes are interleaved with the merge
; passes of the new unit.
;
; All frame displacements carry an extra +4 compared with the code that
; runs at the routine's top level, because the bsr has pushed a return
; address:
;   (n*4+4,sp)  plane n longword of the pending unit
;   (32+4,sp)   output address of the pending unit
;   (36+4,sp)   scratch: output address of the unit being converted
;
; In : a0 = chunky data of the unit, a1 = its output address,
;      a3 = plsiz (distance between bitplanes)
; Out: a0/a1 unchanged; the frame now holds the new unit and its
;      output address.
; Clobbers d0-d7/a5/a6.
;
; The two shift-by-4 passes use merge4, so that for depth <= 4 the
; results that only feed planes 4 and up are not computed; for
; depth > 4 merge4 expands to the ordinary merge.
main_case:
		move.l	a1,(36+4,sp)	; save current output address
		move.l	(32+4,sp),a1	; a1 = previous output address

; high word of each register from bytes 0..15, low word from bytes 16..31
		move.l	(0,a0),d1
	 	move.l	(4,a0),d3
	 	move.l	(8,a0),d0
		move.l	(12,a0),d2
		move.l	(2,a0),d4
	 	move.l	(10,a0),d5
		move.l	(6,a0),d6
		move.l	(14,a0),d7

	 	move.w	(16,a0),d1
	 	move.w	(24,a0),d0
		move.w	(20,a0),d3
		move.w	(28,a0),d2
	 	move.w	(18,a0),d4
	 	move.w	(26,a0),d5
		move.w	(22,a0),d6
		move.w	(30,a0),d7

		move.l	d6,a5		; free d6/d7 for use as merge temporaries
		move.l	d7,a6

		move.l	(0*4+4,sp),(a1)		;plane0
		adda.l	a3,a1			;a1+=plsiz

		merge	d1,d0,d6,d7,$00ff00ff,8
		merge	d3,d2,d6,d7,$00ff00ff,8

		move.l	(1*4+4,sp),(a1)		;plane1
		adda.l	a3,a1			;a1+=plsiz

		merge4	d1,d3,d6,d7,$0f0f0f0f
		merge4	d0,d2,d6,d7,$0f0f0f0f

		exg	d1,a5		; park d1/d0, fetch the values loaded into d6/d7
		exg	d0,a6

		move.l	(2*4+4,sp),(a1)		;plane2
		adda.l	a3,a1			;a1+=plsiz

		merge	d4,d5,d6,d7,$00ff00ff,8
		merge	d1,d0,d6,d7,$00ff00ff,8

		move.l	(3*4+4,sp),(a1)		;plane3
		adda.l	a3,a1			;a1+=plsiz

		merge4	d4,d1,d6,d7,$0f0f0f0f
		merge4	d5,d0,d6,d7,$0f0f0f0f

	ifgt depth-4
		move.l	(4*4+4,sp),(a1)		;plane4
		adda.l	a3,a1			;a1+=plsiz
	endc

		merge	d3,d1,d6,d7,$33333333,2
		merge	d2,d0,d6,d7,$33333333,2

	ifgt depth-4
		move.l	(5*4+4,sp),(a1)		;plane5
		adda.l	a3,a1			;a1+=plsiz
	endc

		merge	d3,d2,d6,d7,$55555555,1
		merge	d1,d0,d6,d7,$55555555,1

; planes 0-3 of the pending unit have been written above, so their
; frame slots can now take the new unit
		move.l	d0,(0*4+4,sp)		;plane0 (movem.l is slower!)
		move.l	d1,(1*4+4,sp)		;plane1
		move.l	d2,(2*4+4,sp)		;plane2
		move.l	d3,(3*4+4,sp)		;plane3

	ifgt depth-4
		move.l	a5,d3		; recover the values parked by the exg above
		move.l	a6,d2

	ifgt depth-6
		move.l	(6*4+4,sp),(a1)		;plane6
		adda.l	a3,a1			;a1+=plsiz
	endc

		merge	d3,d4,d6,d7,$33333333,2
		merge	d2,d5,d6,d7,$33333333,2

	ifgt depth-6
		move.l	(7*4+4,sp),(a1)		;plane7
		adda.l	a3,a1			;a1+=plsiz
	endc

	ifgt depth-6
		merge	d3,d2,d6,d7,$55555555,1
	endc
		merge	d4,d5,d6,d7,$55555555,1

		move.l	d5,(4*4+4,sp)		;plane4
		move.l	d4,(5*4+4,sp)		;plane5

	ifgt depth-6
		move.l	d2,(6*4+4,sp)		;plane6
		move.l	d3,(7*4+4,sp)		;plane7
	endc

	endc

		movea.l	(36+4,sp),a1	; restore current output address
		move.l	a1,(32+4,sp)	; save output address

		rts


; final_case - write out the unit still pending in the stack frame.
;
; The plane longwords sit consecutively at the bottom of the frame, so
; they are fetched through a post-incrementing pointer.  a5 is used for
; that: its contents are no longer needed once conversion has finished
; and it is reloaded from the stack by the epilogue.
final_case:	movea.l	(32,sp),a1		; a1 = previous output address
		movea.l	sp,a5			; a5 -> plane0 longword in the frame

		move.l	(a5)+,(a1)		;plane0
		adda.l	a3,a1			;a1+=plsiz
		move.l	(a5)+,(a1)		;plane1
		adda.l	a3,a1			;a1+=plsiz
		move.l	(a5)+,(a1)		;plane2
		adda.l	a3,a1			;a1+=plsiz
		move.l	(a5)+,(a1)		;plane3
	ifgt depth-4
		adda.l	a3,a1			;a1+=plsiz
		move.l	(a5)+,(a1)		;plane4
		adda.l	a3,a1			;a1+=plsiz
		move.l	(a5)+,(a1)		;plane5
	ifgt depth-6
		adda.l	a3,a1			;a1+=plsiz
		move.l	(a5)+,(a1)		;plane6
		adda.l	a3,a1			;a1+=plsiz
		move.l	(a5)+,(a1)		;plane7
	endc
	endc

; Epilogue: drop the 44-byte frame, reload the registers saved on entry
; and return to the caller.
exit:		lea	(44,sp),sp		; release temporary variables
		movem.l	(sp)+,d2-d7/a2-a6
		rts

		cnop	0,4			; pad to a longword boundary
end:

		end
