4,887,235 513 514 ;; The alu operation depends upon both source and destination bits (defucode ubitblt-short-row-both (read bb-s-word) (assign a-temp (+ bb-width-b bb-d-bitpos)) (if (lesser-or-equal-fixnum a-temp (b-constant 32.)) ;; destination is entirely within one word (sequential (assign byte-s (1- bb-width)) (assign byte-r bb-d-bitpos) (parallel (assign-vma-offset d) (jump bb-byte-alu-operation-dispatch))) ;jcall ;; destination is split across two words (sequential ;; make sure we have write access to the high byte so no pclsr after etoring low (assign-vma-offset d 1) (start-memory read write) ;; store the low byte (assign byte-s (31- bb-d-bitpos)) (assign byte-r bb-d-bitpos) (parallel (assign-vma-offset d) (call bb-byte-alu-operation-dispatch)) ;; store the night byte (assign bb-s-word (rotate bb-s-word byte-r)) (assign byte-s (1- a-temp)) (assign byte-r (b-constant 0)) (parallel (assign-vma-offset d 1) (jump bb-byte-alu-operation-dispatch))))) ;jcall ;(boole fn x y ...) if fn is "abcd" then ; y 0 1 2 3 4 5 6 7 ; | 0 1 0 x*y ~x*y y x*~y x x#y x+y ; ---------- ; 0 | a c 8 9 l0 11 12 13 14 15 ; x | ~(x+y) ~(x#y) ~x ~x+y ~y x+~y ~x+~y -1 ; 1 | b d ;;vma and byte regs have been set up already, for DPB. ;;trashes a-temp-2, b-temp-2. b-temp-3. but not a-temp and b-temp. (defucode bb-byte-alu-operation-dispatch (dispatch-after-this (parallel (start-memory read) (ldb bb-alu-operation 4 0)) (parallel (assign b-temp-3 (dpb bb-s-word byte-s byte-r bb-identity)) (waiting-for-memory)) ((1 2) ;;1 x*y logand ;;2 ~x*y logand (parallel-with-return (parallel (declare-memory-timing data-cycle) (abus-array-data (store-word (logand memory-data b-temp-3)))))) ((4 8.) ;;4 ~(~x+y) = x*~y andc2 ;;8 ~e(x+y) = ~x*~y andcb (parallel (declare-memory-timing data-cycle) (abus-array-data (assign a-temp-2 memory-data))) (assign b-temp-2 (dpb (b-constant -1) byte-s byte-r 0)) ;can’t merge this... (assign a-temp-2 (logxor a-temp-2 b-temp-2)) ;...with this. (parallel-with-return (store-word (logand a-temp-2 b-temp-3)))) ((6 9.) ;;6 x#y logxor ;;9 ~(x~y)=.x#y logxor (parallel-with-return (parallel (declare-memory-timing data-cycle) (abus-array-data (store-word (logxor b-temp-3 memory-data)))))) ((7 11.) ;;7 x+y logior ;;ll ~x+y logior (parallel-with-return (parallel (declare-memory-timing data-cycle) (abus-array-data (store-word (logior b-temp-3 memory-data)))))) ((13. 14.) ;;13 x+~y = ~(~x*y) lognand ;;14 ~x+~y=~(x*y) (parallel (declare-memory-timing data-cycle) (abus-array-data (assign a-temp-2 (logand b-temp-3 memory-data)))) (parallel-with-return (store-word (logxor (dpb (b-constant -1) byte-s byte-r 0) a-temp-2)))))) ;; vma has been set up already (defucode bb-word-alu-operation-dispatch ;commonly 3 cycles (plus 1 for the call) (dispatch-after-this (parallel (start-memory read) (ldb bb-alu-operation 4 0)) (waiting-for-memory) ;---want to use this somehow... ((1 2) ;;1 x*y logand ;;2 ~x*y logand (parallel (declare-memory-timing data-cycle) (abus-array-data (store-word (logand bb-s-word memory-data))) (return))) ((4 8.) ;;4 x-y andcb ;;8 ~(x+y) ~x*~y andcb 4,887,235 515 516 (parallel (declare-memory-timing data-cycle) (abus-array-data (store-word (andc2 bb-s-word memory-data))) (return))) ((6 9.) ;;6 x#y logxor ;;9 ~(x#y)=x#y logxor (parallel (declare-memory-timing data-cycle) (abus-array-data (store-word (logxor bb-s-word memory-data))) (return))) ((7 11.) ;;7 x+y logior ;;11 ~x+y logior (parallel (declare-memory-timing data-cycle) (abus-array-data (store-word (logior bb-m-word memory-data))) (return))) ((13. 14.) ;;13 x+-y - .(.xsy) ;;14 ~x+~y-.(x*y) (parallel (declare-memory-timing data-cycle) (abus-array-data (store-word (lognand bb-s-word memory-data))) (return))))) ;;alu depends only on source bits (defucode ubitblt-long-row-source (parallel (assign b-temp bb-d-bitpos) (if (zero-fixnum bb-d-bitpos) (if (zero-fixnum bb-s-bitpos) (goto ubitblt-aligned-row-source) ;; SSSSSSSSSSSSSSSSSSSSSSSSSsssssss ;; dddddddddddddddddddddddddddddddd (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (parallel (assign bb-s-word2 (logxor bb-constant (rotate memory-data byte-r))) (lisp (trace-path #/c)) (jump ubitblt-d-aligned-row-source)))) (if (equal-fixnum b-temp bb-s-bitpos) ;;SSSSSSSSSSSSSSSSSSSSSSSSSsssssss ;;DDDDDDDDDDDDDDDDDDDDDDDDDddddddd (sequential (parallel-with-s-access bb-s-offset (assign b-temp (32- bb-d-bitpos)) (assign byte-r b-temp) (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))) (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-s (1- b-temp)) (store-word (dpb bb-s-word byte-s byte-r memory-data))) ;; First partial word done, we are now the aligned ease (incr-wrap-s-offset) (incr-d-offset) (assign bb-width (- bb-width b-temp)) (assign bb-s-bitpos (b-constant 0)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/b)) (jump ubitblt-aligned-row-source))) (if (lesser-fixnum bb-5-bitpos b-temp) ;;sssssssssSSSSSSSSSSSSSSSS....... ;; DDDDDDDDDDDDDODDdddddddddddddddd ;; <- 32-d.bitpos-> (sequential (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign b-temp (32- bb-d-bitpos)) (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))) ;;.......sssssssssSSSSSSSSSSSSSSSS (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-s (1- b-temp)) (store-word (dpb bb-s-word byte-s byte-r memory-data))) ;; First partial D word done, some S bits from first word remain (incr-d-offset) ;;rotate s-word further to right by 32-d.bitpos = left by -(32-d.bitpos) ;;SSSSSSSSSSSSSSS........sssssssss (assign bb-s-word2 (rotate bb-s-word byte-r)) (assign bb-s-bitpos (+ bb-s-bitpos b-temp)) (assign bb-width (- bb-width b-temp)) (parallel (assign bb-d-bitpos (b-constant 0))) (lisp (trace-path #/c)) (jump ubitblt-d-aligned-row-source)) (sequential ;;The hiah part of the first source word is not as long as the high part of the ;;first destination word. So extract the useful part of the first source word. ;;and deposit into it as much of the second source word as needed to fill out the rest ;;of the first destination word. Then position the rest of the second source word ;;appropriately for the inner loop. ;; <- 32-s -> ;; ................................|SSSSSSSSSSssssssssssssssssssssss ;; DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd (parllel-with-s-access bb-s-offset 4,887,235 517 518 (assign byte-r (32- bb-s-bitpos)) (assign b-temp-2 bb-s-bitpos) (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))) (incr-wrap-s-offset-ahead) ;; <----- s-d ----> <- 32-s -> (32-d)-(32-s)=s-d ;; ssssssssssssssssSSSSSSSSSSSSSSSS|1111111111..................... ;; DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd (parallel-with-s-access bb-s-offset-ahead (assign byte-r (32- bb-s-bitpos)) (assign byte-s (- b-temp-2 bb-d-bitpos 1)) (assign bb-s-word2 (logxor bb-constant memory-data))) (assign bb-s-word (dpb bb-s-word2 byte-o byte-r bb-s-word)) (parallel (assign a-temp (32- bb-d-bitpos)) (assign b-temp obus)) (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-s (1- a-temp)) (store-word (dpb bb-s-word byte-s byte-r memory-data))) ;; We have now done the first partial D word. Turn into the d-aligned ;; case, with the source advanced by one word from whero it started. (incr-d-offset) (assign bb-s-offset bb-s-offset-ahead) (assign bb-s-bitpos (- b-temp-2 bb-d-bitpos)) (assign byte-r (32- bb-s-bitpos)) (assign bb-s-word2 (rotate bb-s-word2 byte-r)) (assign bb-width (- bb-width b-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/e)) (jump ubitblt-d-aligned-row-source)))))))) (defucode ubitblt-aligned-row-source ;28 cycles per 8 words (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into the buffer (sequential (assign b-temp (+ bb-s-offset (b-constant 8.))) (if (lesser-fixnum bb-s-row-length b-temp) (goto ubitblt-aligned-row-source-slow-loop) (sequential (parallel (assign-vma-offset s) (call ubitblt-block-read-8)) (parallel (assign-vma-offset d) (call ubitblt-block-write-8)) (parallel (assign bb-s-offset (+ bb-s-offset b-block-size)) (jump ubitblt-aligned-row-source))))) ;;Frob with whats left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (b-constant (*4 32.))) (sequential (assign b-temp (+ bb-s-offset (b-constant 4))) (if (lesser-fixnum bb-s-row-length b-temp) (goto ubitblt-aligned-row-source-slow-loop) (sequential (parallel (assign-vma-offset s) (call ubitblt-block-read-4)) (parallel (assign-vma-offset d) (call ubitblt-block-write-4)) (parallel (assign bb-s-offset (+ bb-s-offset b-block-size)) (jump ubitblt-aligned-row-source-slow-loop))))) (goto ubitblt-aligned-row-source-slow-loop)))) (defucode ubitblt-aligned-row-source-slow-loop ;10 cycles per word (parallel-with-s-access bb-s-offset (trap-if (lesser-fixnum bb-width (b-constant 32.)) ubitblt-aligned-row-source-slow-loop-done) (waiting-for-memory) (assign bb-s-word (logxor bb-constant memory-data))) (assign-vma-offset d) ;1 (store-word bb-s-word) ;1 (assign bb-width (- bb-width (b-constant 32.))) ;1 (incr-wrap-s-offset) ;2 (parallel ;1 (incr-d-offset) (lisp (trace-path #/,)) (jump ubitblt-aligned-row-source-slow-loop))) ;Do last partial word, if any (defucode ubitblt-aligned-row-source-slow-loop-done (if (plus-fixnum bb-width) (sequential (parallel-with-s-access bb-s-offset (assign bb-s-word (logxor bb-constant memory-data))) (parallel-with-d-access bb-d-offset (assign byte-r (a-constant 0)) (assign byte-s (1- bb-width)) (parallel-with-return 4,887,235 519 520 (store-word (dpb bb-s-word byte-s byte-r memory-data)) (lisp (trace-path #/2))))) (parllel-with-return (lisp (trace-path #/1))))) ;bb-s-word2 has the pirtial previous source word whose cddress is in bb-e-offset, ;rotated into alignment with the destination (defucode ubitblt-d-aligned-row-source (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into the buffer (sequential (assign b-temp (+ bb-s-offset (b-constant 8.))) (if (lesser-or-equal-fixnum bb-s-row-length b-temp) (goto ubitblt-d-aligned-row-source-slow-loop) (sequcntial (parallel (assign-vma-offset s 1) (call ubitblt-block-read-8)) (parallel (assign-vma-offset d) (call ubitblt-d-aligned-block-write-8)) (parallel (assign bb-s-offset (+ bb-s-offset b-block-size)) (jump ubitblt-d-aligned-row-source))))) (if (greater-or-equal-fixnum bb-width (b-constant (* 4. 32.))) (sequential (assign b-temp (+ bb-s-offset (b-constant 4))) (if (lesser-or-equal-fixnum bb-s-row-length b-temp) (goto ubitblt-d-aligned-row-source-slow-loop) (sequential (parallel (assign-vma-offset s 1) (call ubitblt-block-read-4)) (parallel (assign-vma-offset d) (call ubitblt-d-aligned-block-write-4)) (parallel (assign bb-s-offset (+ bb-s-offset b-block-size)) (jump ubitblt-d-aligned-row-source))))) (goto ubitblt-d-aligned-row-source-slow-loop)))) ;;Each pass through this loop stores exactly one d word. Each time through, ;;bb-s-word2 will have the bits to use for the lower part of the d word (already ;;rotated into position), and another s word will be fetched into bb-s-word. ;;Then s-word will get rotated when transferred into s-word2 in preparation for ;;next loop pass. (defucode ubitblt-d-aligned-row-source-slow-loop ;13 cycles per word (incr-wrap-s-offset-ahead) ;2 (parallel-with-s-access bb-s-offset-ahead ;4 (trap-if (lesser-fixnum bb-width (b-constant 32.)) ubitblt-d-aligned-row-source-done) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word (logxor bb-constant memory-data))) (assign byte-r (- (b-constant 32.) bb-s-bitpos)) ;1 (assign-vma-offset d) ;1 (store-word (dpb bb-s-word byte-s byte-r bb-s-word2)) ;1 (assign bb-width (- bb-width (b-constant 32.))) ;1 (incr-d-offset) ;1 (assign bb-s-offset bb-s-offset-ahead) (parallel ;1 (assign bb-s-word2 (rotate bb-s-word byte-r)) (lisp (trace-path #/.)) (jump ubitblt-d-aligned-row-source))) (defucode ubitblt-d-aligned-row-source-done (if (plus-fixnum bb-width) (sequential (assign b-temp (32- bb-s-bitpos)) ;how many bits are valid in bb-s-word2 (if (lesser-or-equal-fixnum bb-width b-temp) ;;we have enough s bits (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (parallel (assign byte-r (b-constant 0)) (assign bb-s-word bb-s-word2)) (parallel (lisp Itrace-path #/4)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data))))) ;;need to get another source word (sequential (parallel-with-s-access bb-s-offset-ahead (assign byte-r (32- bb-s-bitpos)) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word (logxor bb-constant memory-data))) (assign bb-s-word (dpb bb-s-word byte-s byte-r bb-s-word2)) (lisp trace-path #/5)) (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (assign byte-r (a-constant 0)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data))))))) 4,887,235 521 522 ;;XXXbrad - this appears not to match previous page! (parallel (lisp (trace-path #/3)) (return)))) ;;alu depends only on destination bits (defucode ubitblt-long-row-destination (if (plus-fixnum bb-d-bitpos) ;frob the first partial word (sequential (assign b-temp (32- bb-d-bitpos)) (parallel-with-d-access bb-d-offset (assign byte-s (1- b-temp)) (assign byte-r bb-d-bitpos) (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data))) (incr-d-offset) (assign bb-width (- bb-width b-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/b)) (jump ubitblt-long-row-destination-loop))) (machine-version-case ((sim) (parallel (lisp (trace-path #/a)) (jump ubitblt-long-row-destination-loop))) (otherwise (goto ubitblt-long-row-destination-loop))))) (defucode ubitblt-long-row-destination-loop ;25 cycles per 8 words (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into the buffer (sequential (parallel (assign-vma-offset d) (call ubitblt-block-read-8)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-write-8 ubitblt-long-row-destination-loop))) ;;Frob with what’s left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.))) (sequential (parallel (assign-vma-offset d) (call ubitblt-block-read-4)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-write-4 ubitblt-long-row-destination-slow-loop))) (goto ubitblt-long-row-destination-slow-loop)))) (defucode ubitblt-long-row-destination-slow-loop ;5 cycles per word (bus interference) (parallel-with-d-access-check-write bb-d-offset (parallel (assign bb-width (- bb-width (b-constant 32.))) (trap-if (minus-fixnum obus) ubitblt-long-row-destination-done)) ;aborts the assign (parallel (lisp (trace-path #/,)) (waiting-for-memory) (incr-d-offset)) (parallel (store-word (logxor bb-constant memory-data)) (jump ubitblt-long-row-destination-slow-loop)))) (defucode ubitblt-long-row-destination-done (if (plus-fixnum bb-width) (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (assign byte-r (a-constant 0)) (parallel-with-return (lisp (trace-path #/2)) (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data)))) (parallel (lisp (trace-path #/1)) (return)))) (defmacro def-bitblt-block-read (name n) `(defucode ,name (parallel (assign a-block-size (b-constant ,n)) ;Used later to advance offsets (assign b-block-size obus) (start-memory block read)) ;start first word (parallel (waiting-for-memory) ;waiting for first word (start-memory block read)) ;start second word ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect `(abus-array-data (assign (bitblt-buffer ,i) (set-type (logxor bb-constant memory-data) dtp-fix)) ,(selectq (- n-bitblt-buffers i) (1 `(return)) (2 nil) (otherwise `(start-memory block read))))))) 4,887,235 523 524 (def-bitblt-block-read ubitblt-block-read-8 8) ;I suppose this when interned... (def-bitblt-block-read ubitblt-block-read-4 4) ;... will subsume this. (defmacro def-bitblt-block-write (name n) `(defucode ,name (activate-bitblt-buffer) ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect `(parallel (store-word (bitblt-buffer ,i) block) (lisp (trace-path #/.)))) (parallel (assign bb-d-offset (+ bb-d-offset b-block-size)) (call deactivate-bitblt-buffer)) (parallel-with-return (assign bb-width (- bb-width (rotate b-block-size 5))) ;2~5 = bits-per-word ))) (def-bitblt-block-write ubitblt-block-write-8 8) (def-bitblt-block-write ubitblt-block-write-4 4) (defmacro def-d-aligned-block-write (name n) `(defucode ,name (assign byte-s (- bb-s-bitpos)) (parallel (assign byte-r (- (b-constant 32.) bb-s-bitpos)) (call activate-bitblt-buffer)) ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers append `((parallel (store-word (dpb (bitblt-buffer ,i) byte-s byte-r bb-s-word2) block) (lisp (trace-path #/.))) (assign bb-s-word2 (rotate (bitblt-buffer ,i) byte-r)))) (parallel (assign bb-d-offset (+ bb-d-offset b-block-size)) (call deactivation-bitblt-buffer)) (parallel-with-return (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word ))) (def-d-aligned-block-write ubitblt-d-aligned-block-write-8 8.) (def-d-aligned-block-write ubitblt-d-aligned-block-write-4 4.) ;;alu depends on neither source nor destination bits (defucode ubitblt-long-row-neither (if (plus-fixnum bb-d-bitpos) (sequential (assign b-temp (32- bb-d-bitpos)) (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-s (2- b-temp)) (store-word (dpb bb-constant byte-s byte-r memory-data))) (incr-d-offset) (assign bb-width (- bb-width b-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/b)) (jump ubitblt-long-row-neither-loop))) (parallel (lisp (trace-path #/a)) (jump ubitblt-long-row-neither-loop)))) (defucode ubitblt-long-row-neither-loop (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) (sequential (parallel (assign-vma-offset d) (call store-block-bb-constant-8)) (assign bb-d-offset (+ bb-d-offset (b-constant 8.))) (parallel (assign bb-width (- bb-width (b-constant (* 8. 32.)))) (jump ubitblt-long-row-neither-loop))) (sequential (dispatch-after-next (parallel (assign b-block-size (ldb bb-width 3 5)) (ldb bb-width 3 5)) ((7) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-7 ubitblt-long-row-neither-finish))) ((6) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-6 ubitblt-long-row-neither-finish))) ((5) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-5 ubitblt-long-row-neither-finish))) ((4) (parallel (assign-vma-offset d) (call-and-return-to-store-block-bb-constant-4 ubitblt-long-row-neither-finish))) ((3) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-3 ubitblt-long-row-neither-finish))) ((2) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-2 ubitblt-long-row-neither-finish))) ((1) (assign-vma-offset d) 4,887,235 525 526 (parallel (lisp (trace-path #/.)) (store-word bb-constant) (jump ubitblt-long-row-neither-finish)))) (parallel (take-dispatch) (trap-if (zero-fixnum b-block-size) ubitblt-long-row-neither-finish))))) (defucode ubitblt-long-row-neither-finish (assign bb-d-offset (+ bb-d-offset b-block-size)) (assign bb-width (logand bb-width (b-constant #o37))) (if (plus-fixnum bb-width) (parallel-with-d-access bb-d-offset (assign byte-r (a-constant 0)) (assign byte-s (1- bb-width)) (parallel (lisp (trace-path #/2)) (store-word (dpb bb-constant byte-s byte-r memory-data)) (return))) (parallel (lisp (trace-path #/1)) (return)))) (defmacro store-block-bb-constant-routines (n) `(progn 'compile ,@(loop with s = "STORE-BLOCK-BB-CONSTANT-~d" for i from n downto 1 collect `(defucode ,(fintern s i) (parallel (store-word bb-constant block) (lisp (trace-path #/,)) ,(if (> i 1) `(jump ,(fintern s (1- i))) `(return))))))) (store-block-bb-constant-routines 8.) ;;alu depends both source and destination bits (defucode ubitblt-long-row-both (parallel (assign b-temp bb-d-bitpos) (if (zero-fixnum bb-d-bitpos) (if (zero-fixnum bb-s-bitpos) (goto ubitblt-aligned-row-both) (parallel-with-s-access bb-s-offset ;; SSSSSSSSSSSSSSSSSSSSSSSSSSS.ssss ;;ddddddddddddddddddddddddddddddd. (assign byte-r (32- bb-s-bitpos)) (parallel (assign bb-s-word (rotate memory-data byte-r)) (lisp (trace-path #/c)) (jump ubitblt-aligned-row-both)))) (if (equal-fixnum bb-s-bitpos b-temp) (sequential (parallel-with-s-access bb-s-offset ;;SSSSSSSSSSSSSSSSSSSSSSSSSS.ssssss ;;dddddddddddddddddddddddddd.dddddd (parallel (assign byte-r (32- bb-s-bitpos)) (assign b-temp obus)) (assign byte-s (31- bb-s-titpos)) (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r)))) (assign byte-r bb-s-bitpos) (parallel (assign-vma-offset d) ;;ssssssssssssssssssssssssss.ssssss ;;DDDDDDDDDDDDDDDDDDDDDDDDDD.dddddd (call bb-byte-alu-operation-dispatch)) ;; First partial word stored, turn into aligned case (incr-wrap-s-offset) (incr-d-offset) (assign bb-width (- bb-width b-temp)) (assign bb-s-bitpos (b-constant 0)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/b)) (jump ubitblt-aligned-row-both))) (if (lesser-fixnum bb-s-bitpos b-temp) (goto ubitblt-long-row-both-s-longer) (goto ubitblt-long-row-both-s-shorter)))))) (defucode ubitblt-long-row-both-s-longer (assign b-temp (32- bt-d-bitpos)) (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign byte-s (1- b-temp)) (assign bb-s-word2 memory-data)) ;;ssssSSSSSSSSSSSSSSSSSSSS........ ;; DDDDDDDDDDDDDDDDDDDDdddddddddddd ;; <----- b-temp -----> (assign bb-s-word (logxor bb-constant (rotate bb-s-word2 byte-r))) ;;........ssssSSSSSSSSSSSSSSSSSSSS (parallel 4,887,235 527 528 (assign byte-r bb-d-bitpos) (assign b-temp-2 bb-d-bitpos)) (parallel (assign-vma-offset d) ;;ssssssssssssssssssssssss.ssssssss ;; DDDDDDDDDDDDDDDDDDDD.dddddddddddd (call bb-byte-alu-operation-dispatch)) (incr-d-offset) ;;Remaining are (32-(s.bitpos+(32-d.bitpos))) = d.bitpos-s.bitpos ;; <-- 32-d.bitpos ---> <-s.bitpos-> ;;SSSSssssssssssssssssssss.ssssssss ;; dddddddddddddddddddd.dddddddddddd (assign byte-r (- b-temp-2 bb-s-bitpos)) (assign bb-s-bitpos (+ bb-s-bitpos b-temp)) (assign bb-s-word (rotate bb-s-word2 byte-r)) (assign bb-width (- bb-width t-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/d)) (jump ubitblt-d-aligned-row-both))) ;Need two S words to do the first partial D word (defucode ubitblt-long-row-both-s-shorter ;; ssssssssssssssssssssssss.ssssssss ;;dddddddddddddddddddddddddddd.dddd (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign byte-s (31- bb-s-bitpos)) ;; SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss ;;ddddddaddddddddddddddddddddd.dddd (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r)))) (incr-wrap-s-offset-ahead) ;; <--> s.bitpos-d.bitpos ;;...SSSS|sssssssssssmssssssssssss.ssssssss ;; dddd dddddddddddddddddddddddd.dddd (parallel-with-s-access bb-s-offset-ahead (assign byte-s (- bb-s-bitpos b-temp 1)) (assign byte-r (32- bb-s-bitpos)) (assign bb-s-word2 (logxor bb-constant memory-data))) ;;...SSSS|SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss ;; dddd dddddddddddddddddddddddd.dddd (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (assign byte-r bb-d-bitpos) ;;...ssss|ssssssssssssssssssssssss.ssssssss ;; DDDD DDDDDDDDDDDDDDDDDDDDDDDD.dddd (parallel (assign-vma-offset d) (call bb-byte-alu-operation-dispatch)) (incr-d-offset) (assign bb-s-offset bb-s-offset-ahead) ;;...SSSssss|ssssssssssssssssssssssss.ssssssss ;; dddd dddddddddddddddddddddddd.dddd (assign byte-r (- b-temp bb-s-bitpos)) (assign bb-s-bitpos (- bb-s-bitpos b-temp)) (assign b-temp (32- bb-d-bitpos)) (assign bb-s-word (logxor (rotate bb-s-word2 byte-r) bb-constant)) (assign bb-width (- bb-width b-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/e)) (jump ubitblt-d-aligned-row-both))) (defucode ubitblt-aligned-row-both (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into the buffer (sequential (assign b-temp (+ bb-s-offset (b-constant 8.))) (if (lesser-fixnum bb-s-row-length b-temp) (goto ubitblt-aligned-row-both-slow-loop) (sequential (parallel (assign-vma-offset s) (call ubitblt-block-read-8)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-alu-8 ubitblt-aligned-row-both))))) ;; Frob with what’s left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.))) (sequential (assign b-temp (+ bb-s-offset (b-constant 4.))) (if (lesser-fixnum bb-s-row-length b-temp) (goto ubitblt-aligned-row-both-slow-loop) (sequential (parallel (assign-vma-offset c) (call ubitblt-block-read-4)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-alu-4 ubitblt-aligned-row-both-slow-loop))))) (goto ubitblt-aligned-row-both-slow-loop)))) 4,887,235 529 530 (defucode ubitblt-aligned-row-both-slow-loop ;12 cycles per word (parallel-with-s-access bb-s-offset ;4 cycles (trap-if (lesser-fixnum bb-width (b-constant 32.)) ubitblt-aligned-row-both-slow-loop-done) (waiting-for-memory) (assign bb-s-word (logxor bb-constant memory-data))) (parallel ;1+3 cycles (assign-vma-offset d) (call bb-word-alu-operation-dispatch)) (assign bb-width (- bb-width (b-constant 32.))) ;1 cycle (incr-wrap-s-offset) ;2 cycles (parallel (incr-d-offset) ;1 cycle (lisp (trace-path #/.)) (jump ubitblt-aligned-row-both))) (defucode ubitblt-aligned-row-both-slow-loop-done (if (plus-fixnum bb-width) (sequential (parallel-with-s-access bb-s-offset (assign byte-r (b-constant 0)) (assign byte-s (1- bb-width)) (assign bb-s-word (logxor bb-constant memory-data))) (parallel (lisp (trace-path #/2)) (assign-vma-offset d) (jump bb-byte-alu-operation-dispatch))) ;jcall (parallel-with-return (lisp (trace-path #/1))))) (defucode ubitblt-block-alu-8 (dispatch-after-this (ldb bb-alu-operation 4 0) (parallel (assign a-block-size (a-constant 8.)) (assign b-block-size (a-constant 0.)) (start-memory block read)) ;start first word ((1 2) (goto ubitblt-block-logand-8)) ; x*y ~x*y ((4 8.) (goto ubitblt-block-andc2-8)) ; x*~y ~x*~y ((6 9.) (goto ubitblt-block-logxor-8)) ; x xor y, ~x xor y ((7 11.) (goto ubitblt-block-logior-8)) ; x+y, ~x+y ((13. 14.) (goto ubitblt-block-lognand-8)))) ; ~(~x*y), ~(x*y) (defucode ubitblt-block-alu-4 (dispatch-after-this (ldb bb-alu-operation 4 0) (parallel (assign a-block-size (a-constant 4.)) (assign b-block-size (a-constant 4.)) (start-memory block read)) ;start first word ((1 2) (goto ubitblt-block-logand-4)) ; x*y ~x*y ((4 8.) (goto ubitblt-block-andc2-4)) ; x*~y ~x*~y ((6 3.) (octo ubitblt-block-logxor-4)) ; x xor y, ~x xor y ((7 11.) igoto ubitbit-block-logior-4)) ; x+y, ~x+y ((13. 14.) (goto ubitblt-block-lognand-4)))) ; ~(~x*y), ~(x*y) (defmacro def-block-aluop (name n alu) (if (memq (get (caddr (microexpand `(,alu a-temp b-temp))) 'alu) weird-alu-functions) ;; Cannot simultaneously run ALU and storm into tne bitblt-buffer `(defucode ,name (parallel (waiting-for-memory) ;first word already started (declare-memory-timing active-cycle)) ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect `(sequential (abus-array-data (assign b-temp (,alu (bitblt-buffer ,i) memory-data)) `(if (> (- n-bitblt-buffers i) 1) `(start-memory block read))) ;start next word (parallel (assign (bitblt-buffer ,i) (set-type b-temp dtp-fix)) ,(if (= (- n-bitblt-buffers i) 1) `(jump (fintern "UBITBLT-BLOCK-ALU-WRITE-~d" n))))))) ;; Normal case `(defucode ,name (parallel (waiting-for-memory) ;first word alread started (declare-memory-timing active-cycle) (start-memory read block)) ;start second word ,@e(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect `(parallel (abus-array-data (assign (bitbit-buffer ,1) (set-type (,alu (bitblt-buffer ,i) memory-data) dtp-fix))) ,(selectq (- n-bitblt-buffers i) (1 `(jump ,(fintern "UBITBLT-BLOCK-ALU-WRITE-~d" n))) (2 nil) (otherwise `(start-memory block read))) ;start word after next ))))) (def-block-aluop ubitblt-block-logand-8 8 logand) (def-block-aluop ubitblt-block-logior-8 8 logior) 4,887,235 531 532 (def-block-aluop ubitblt-block-logxor-8 8 logxor) (def-block-aluop ubitblt-block-andc2-8 8 andc2) (def-block-aluop ubitblt-block-lognand-8 8 lognand) (def-block-aluop ubitblt-block-logand-4 4 logand) (def-block-aluop ubitblt-block-loglor-4 4 logior) (def-block-aluop ubitblt-block-logxor-4 4 logxor) (def-block-aluop ubitblt-block-andc2-4 4 andc2) (def-block-aluop ubitblt-blocK-lognand-4 4 lognand) (defmacro def-block-alu-write (name n) `(defucode ,name (parallel (assign-vma-offset d) (call activate-bitblt-buffer)) ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect `(parallel (store-word (bitblt-buffer ,i) block) (lisp (trace-path #/,)))) (parallel (assign bb-d-offset (+ bb-d-offset b-block-size)) (call dectivate-bitblt-buffer)) (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word (parallel (assign bb-s-offset (+ bb-s-offset b-block-size)) (return)))) (def-block-alu-write ubitblt-block-alu-write-8 8) (def-block-alu-write ubitblt-block-alu-write-4 4) ;;Each time through the loop, s-word was fetched from memory like ;; <----- s.bitpos -----> ;;ssssssssss...................... ;;and then rotated so it looks like ;;......................ssssssssss ;;<----- s.bitpos -----> ;; ;:Each time, another s-word2 gets fetched and deposited into s-word like ;; |<----- s.bitpos -----> ;; |......................1111111111 ;;2222222222 2222222222222222222222 ;; ;;The rotation for the dpb equals the rotation for setup for next loop. ;;bb-s-word has the partial previous source word whose address is in bb-s-offset, ;;rotated into alignment with the destination, but not xored with bb-constant (defucode ubitblt-d-aligned-row-both (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into the buffer (sequential (assign b-temp (+ bb-s-offset (b-constant 8.))) (if (lesser-or-equal-fixnum bb-s-row-length b-temp) (goto ubitblt-d-aligned-row-both-slow-loop) (sequential (parallel (assign-vma-offset s 1) (call ubitblt-rotated-block-read-8)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-alu-8 ubitblt-d-aligned-row-both))))) ;;Frob with what’s left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.))) (sequential (assign b-temp (+ bb-s-offset (b-constant 4.))) (if (lesser-or-equal-fixnum bb-s-row-length t-temp) (goto ubitblt-d-aligned-row-both-slow-loop) (sequential (parallel (assign-vma-offset s 1) (call ubitblt-rotated-block-read-4)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-alu-4 ubitblt-d-aligned-row-both-slow-loop))))) (goto ubitblt-d-aligned-row-both-slow-loop)))) (defucode ubitblt-d-aligned-row-both-slow-loop ;17 cycles per word (incr-wrap-s-offset-ahead) ;2 (parallel-with-s-access bb-s-offset-ahead (trap-if (lesser-fixnum bb-width (b-constant 32.)) ubitblt-d-aligned-row-both-done) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word2 memory-data)) (assign byte-r (32- bb-s-bitpos)) (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) ;1 (assign bb-s-word (logxor bb-constant-a bb-s-word)) ;1 (parallel ;1+3 (assign-vma-offset d)