*BSD News Article 6818


Return to BSD News archive

Newsgroups: comp.unix.bsd
Path: sserve!manuel.anu.edu.au!munnari.oz.au!uunet!pmafire!news.dell.com!swrinde!gatech!hubcap!ncrcae!ncr-sd!crash!fpm
From: fpm@crash.cts.com (Frank Maclachlan)
Subject: Functions are not word-aligned in 386BSD 0.1 locore.s (w/ patch)
Organization: CTS Network Services (crash, ctsnet), El Cajon, CA
Date: 20 Oct 92 13:32:56 PDT
Message-ID: <1992Oct20.133257.29726@crash>
Keywords: patch 386BSD
Lines: 738

I've wanted to post this for a long time, but resisted since it doesn't
*fix* anything; it simply makes certain kernel routines a tiny bit
faster.

I noticed that branch/function call addresses and some data items in
'/sys/i386/i386/locore.s' and '/sys/i386/isa/icu.s' are not aligned on
32 bit word boundaries (0 modulo 4).  This causes a small performance
hit on 386DX/486 based machines.  The 486, in fact, performs best when
branch addresses are aligned on 16 byte boundaries (0 modulo 16) due to
its burst cache fill capability; this is one of the reasons why the gcc
2.2.2 -486 option often results in larger binaries.

Anyway, I modified '/sys/i386/i386/locore.s' and '/sys/i386/isa/icu.s'
to align things on 4 byte boundaries.  I used a macro, ALIGN32, to
perform the alignment since the new version of the GNU assembler will
probably change the meaning of .align 2.  I also eliminated an
unnecessary instruction in ___divsi3.  I don't claim a profound
performance boost, but every little bit helps!

To apply the patch, change directory to /sys and type the following:

	patch -p0 <patch_file_name

Here's the patch:

diff -rc i386.ORIG/i386/locore.s i386/i386/locore.s
*** i386.ORIG/i386/locore.s	Wed Aug 12 21:03:05 1992
--- i386/i386/locore.s	Tue Oct 20 12:56:10 1992
***************
*** 63,68 ****
--- 63,69 ----
  	.set	SYSPDROFF,0x3F8		# Page dir index of System Base
  
  #define	NOP	inb $0x84, %al ; inb $0x84, %al 
+ #define	ALIGN32	.align 2	/* 2^2  = 4 */
  
  /*
   * PTmap is recursive pagemap at top of virtual address space.
***************
*** 372,377 ****
--- 373,379 ----
  	 * Support routines for GCC
  	 */
  	.globl ___udivsi3
+ 	ALIGN32
  ___udivsi3:
  	movl 4(%esp),%eax
  	xorl %edx,%edx
***************
*** 379,387 ****
  	ret
  
  	.globl ___divsi3
  ___divsi3:
  	movl 4(%esp),%eax
! 	xorl %edx,%edx
  	cltd
  	idivl 8(%esp)
  	ret
--- 381,390 ----
  	ret
  
  	.globl ___divsi3
+ 	ALIGN32
  ___divsi3:
  	movl 4(%esp),%eax
! 	#xorl %edx,%edx		/* not needed - cltd sign extends into %edx */
  	cltd
  	idivl 8(%esp)
  	ret
***************
*** 390,395 ****
--- 393,399 ----
  	 * I/O bus instructions via C
  	 */
  	.globl	_inb
+ 	ALIGN32
  _inb:	movl	4(%esp),%edx
  	subl	%eax,%eax	# clr eax
  	NOP
***************
*** 398,403 ****
--- 402,408 ----
  
  
  	.globl	_inw
+ 	ALIGN32
  _inw:	movl	4(%esp),%edx
  	subl	%eax,%eax	# clr eax
  	NOP
***************
*** 406,411 ****
--- 411,417 ----
  
  
  	.globl	_rtcin
+ 	ALIGN32
  _rtcin:	movl	4(%esp),%eax
  	outb	%al,$0x70
  	subl	%eax,%eax	# clr eax
***************
*** 413,418 ****
--- 419,425 ----
  	ret
  
  	.globl	_outb
+ 	ALIGN32
  _outb:	movl	4(%esp),%edx
  	NOP
  	movl	8(%esp),%eax
***************
*** 421,426 ****
--- 428,434 ----
  	ret
  
  	.globl	_outw
+ 	ALIGN32
  _outw:	movl	4(%esp),%edx
  	NOP
  	movl	8(%esp),%eax
***************
*** 433,438 ****
--- 441,447 ----
  	 */
  
  	.globl _bzero
+ 	ALIGN32
  _bzero:
  	pushl	%edi
  	movl	8(%esp),%edi
***************
*** 454,459 ****
--- 463,469 ----
  	 */
  
  	.globl _fillw
+ 	ALIGN32
  _fillw:
  	pushl	%edi
  	movl	8(%esp),%eax
***************
*** 466,471 ****
--- 476,482 ----
  	ret
  
  	.globl _bcopyb
+ 	ALIGN32
  _bcopyb:
  	pushl	%esi
  	pushl	%edi
***************
*** 486,491 ****
--- 497,503 ----
  	 */
  
  	.globl	_bcopy,_ovbcopy
+ 	ALIGN32
  _ovbcopy:
  _bcopy:
  	pushl	%esi
***************
*** 507,512 ****
--- 519,525 ----
  	popl	%esi
  	xorl	%eax,%eax
  	ret
+ 	ALIGN32
  1:
  	addl	%ecx,%edi	/* copy backwards. */
  	addl	%ecx,%esi
***************
*** 530,535 ****
--- 543,549 ----
  
  #ifdef notdef
  	.globl	_copyout
+ 	ALIGN32
  _copyout:
  	movl	_curpcb, %eax
  	movl	$cpyflt, PCB_ONFAULT(%eax) # in case we page/protection violate
***************
*** 596,601 ****
--- 610,616 ----
  	ret
  
  	.globl	_copyin
+ 	ALIGN32
  _copyin:
  	movl	_curpcb,%eax
  	movl	$cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate
***************
*** 621,626 ****
--- 636,642 ----
  	movl	%eax,PCB_ONFAULT(%edx)
  	ret
  
+ 	ALIGN32
  cpyflt:
  	popl	%ebx
  	popl	%edi
***************
*** 631,636 ****
--- 647,653 ----
  	ret
  #else
  	.globl	_copyout
+ 	ALIGN32
  _copyout:
  	movl	_curpcb,%eax
  	movl	$cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate
***************
*** 655,660 ****
--- 672,678 ----
  	ret
  
  	.globl	_copyin
+ 	ALIGN32
  _copyin:
  	movl	_curpcb,%eax
  	movl	$cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate
***************
*** 678,683 ****
--- 696,702 ----
  	movl	%eax,PCB_ONFAULT(%edx)
  	ret
  
+ 	ALIGN32
  cpyflt: popl	%edi
  	popl	%esi
  	movl	_curpcb,%edx
***************
*** 689,694 ****
--- 708,714 ----
  
  	# insb(port,addr,cnt)
  	.globl	_insb
+ 	ALIGN32
  _insb:
  	pushl	%edi
  	movw	8(%esp),%dx
***************
*** 705,710 ****
--- 725,731 ----
  
  	# insw(port,addr,cnt)
  	.globl	_insw
+ 	ALIGN32
  _insw:
  	pushl	%edi
  	movw	8(%esp),%dx
***************
*** 720,725 ****
--- 741,747 ----
  
  	# outsw(port,addr,cnt)
  	.globl	_outsw
+ 	ALIGN32
  _outsw:
  	pushl	%esi
  	movw	8(%esp),%dx
***************
*** 735,740 ****
--- 757,763 ----
  
  	# outsb(port,addr,cnt)
  	.globl	_outsb
+ 	ALIGN32
  _outsb:
  	pushl	%esi
  	movw	8(%esp),%dx
***************
*** 753,758 ****
--- 776,782 ----
  	 * void lgdt(struct region_descriptor *rdp);
  	 */
  	.globl	_lgdt
+ 	ALIGN32
  _lgdt:
  	/* reload the descriptor table */
  	movl	4(%esp),%eax
***************
*** 779,784 ****
--- 803,809 ----
  	 * void lidt(struct region_descriptor *rdp);
  	 */
  	.globl	_lidt
+ 	ALIGN32
  _lidt:
  	movl	4(%esp),%eax
  	lidt	(%eax)
***************
*** 788,793 ****
--- 813,819 ----
  	 * void lldt(u_short sel)
  	 */
  	.globl	_lldt
+ 	ALIGN32
  _lldt:
  	lldt	4(%esp)
  	ret
***************
*** 796,801 ****
--- 822,828 ----
  	 * void ltr(u_short sel)
  	 */
  	.globl	_ltr
+ 	ALIGN32
  _ltr:
  	ltr	4(%esp)
  	ret
***************
*** 805,810 ****
--- 832,838 ----
  	 */
  	.globl	_lcr3
  	.globl	_load_cr3
+ 	ALIGN32
  _load_cr3:
  _lcr3:
  	inb	$0x84,%al	# check wristwatch
***************
*** 816,821 ****
--- 844,850 ----
  
  	# tlbflush()
  	.globl	_tlbflush
+ 	ALIGN32
  _tlbflush:
  	inb	$0x84,%al	# check wristwatch
  	movl	%cr3,%eax
***************
*** 826,831 ****
--- 855,861 ----
  
  	# lcr0(cr0)
  	.globl	_lcr0,_load_cr0
+ 	ALIGN32
  _lcr0:
  _load_cr0:
  	movl	4(%esp),%eax
***************
*** 834,839 ****
--- 864,870 ----
  
  	# rcr0()
  	.globl	_rcr0
+ 	ALIGN32
  _rcr0:
  	movl	%cr0,%eax
  	ret
***************
*** 840,845 ****
--- 871,877 ----
  
  	# rcr2()
  	.globl	_rcr2
+ 	ALIGN32
  _rcr2:
  	movl	%cr2,%eax
  	ret
***************
*** 847,852 ****
--- 879,885 ----
  	# rcr3()
  	.globl	_rcr3
  	.globl	__cr3
+ 	ALIGN32
  __cr3:
  _rcr3:
  	movl	%cr3,%eax
***************
*** 854,859 ****
--- 887,893 ----
  
  	# ssdtosd(*ssdp,*sdp)
  	.globl	_ssdtosd
+ 	ALIGN32
  _ssdtosd:
  	pushl	%ebx
  	movl	8(%esp),%ecx
***************
*** 877,882 ****
--- 911,917 ----
  /*
   * {fu,su},{byte,word}
   */
+ 	ALIGN32
  ALTENTRY(fuiword)
  ENTRY(fuword)
  	movl	_curpcb,%ecx
***************
*** 887,892 ****
--- 922,928 ----
  	movl	$0,PCB_ONFAULT(%ecx)
  	ret
  	
+ 	ALIGN32
  ENTRY(fusword)
  	movl	_curpcb,%ecx
  	movl	$fusufault,PCB_ONFAULT(%ecx) #in case we page/protection violate
***************
*** 896,901 ****
--- 932,938 ----
  	movl	$0,PCB_ONFAULT(%ecx)
  	ret
  	
+ 	ALIGN32
  ALTENTRY(fuibyte)
  ENTRY(fubyte)
  	movl	_curpcb,%ecx
***************
*** 906,911 ****
--- 943,949 ----
  	movl	$0,PCB_ONFAULT(%ecx)
  	ret
  	
+ 	ALIGN32
  fusufault:
  	movl	_curpcb,%ecx
  	xorl	%eax,%eax
***************
*** 913,918 ****
--- 951,957 ----
  	decl	%eax
  	ret
  
+ 	ALIGN32
  ALTENTRY(suiword)
  ENTRY(suword)
  	movl	_curpcb,%ecx
***************
*** 944,949 ****
--- 983,989 ----
  	movl	%eax,PCB_ONFAULT(%ecx) #in case we page/protection violate
  	ret
  	
+ 	ALIGN32
  ENTRY(susword)
  	movl	_curpcb,%ecx
  	movl	$fusufault,PCB_ONFAULT(%ecx) #in case we page/protection violate
***************
*** 972,977 ****
--- 1012,1018 ----
  	movl	%eax,PCB_ONFAULT(%ecx) #in case we page/protection violate
  	ret
  
+ 	ALIGN32
  ALTENTRY(suibyte)
  ENTRY(subyte)
  	movl	_curpcb,%ecx
***************
*** 1001,1006 ****
--- 1042,1048 ----
  	movl	%eax,PCB_ONFAULT(%ecx) #in case we page/protection violate
  	ret
  
+ 	ALIGN32
  	ENTRY(setjmp)
  	movl	4(%esp),%eax
  	movl	%ebx, 0(%eax)		# save ebx
***************
*** 1013,1018 ****
--- 1055,1061 ----
  	xorl	%eax,%eax		# return (0);
  	ret
  
+ 	ALIGN32
  	ENTRY(longjmp)
  	movl	4(%esp),%eax
  	movl	 0(%eax),%ebx		# restore ebx
***************
*** 1044,1049 ****
--- 1087,1093 ----
   *
   * Call should be made at spl6(), and p->p_stat should be SRUN
   */
+ 	ALIGN32
  ENTRY(setrq)
  	movl	4(%esp),%eax
  	cmpl	$0,P_RLINK(%eax)	# should not be on q already
***************
*** 1070,1075 ****
--- 1114,1120 ----
   *
   * Call should be made at spl6().
   */
+ 	ALIGN32
  ENTRY(remrq)
  	movl	4(%esp),%eax
  	movzbl	P_PRI(%eax),%edx
***************
*** 1106,1111 ****
--- 1151,1157 ----
   * to wait for something to come ready.
   */
  	.globl	Idle
+ 	ALIGN32
  Idle:
  idle:
  	call	_spl0
***************
*** 1123,1128 ****
--- 1169,1175 ----
  /*
   * Swtch()
   */
+ 	ALIGN32
  ENTRY(swtch)
  
  	incl	_cnt+V_SWTCH
***************
*** 1254,1259 ****
--- 1301,1307 ----
  	ret
  
  	.globl	_mvesp
+ 	ALIGN32
  _mvesp:	movl	%esp,%eax
  	ret
  /*
***************
*** 1265,1270 ****
--- 1313,1319 ----
   * Since this code requires a parameter from the "old" stack,
   * pass it back as a return value.
   */
+ 	ALIGN32
  ENTRY(swtch_to_inactive)
  	popl	%edx			# old pc
  	popl	%eax			# arg, our return value
***************
*** 1279,1284 ****
--- 1328,1334 ----
   * Update pcb, saving current processor state and arranging
   * for alternate return ala longjmp in swtch if altreturn is true.
   */
+ 	ALIGN32
  ENTRY(savectx)
  	movl	4(%esp), %ecx
  	movw	_cpl, %ax
***************
*** 1329,1334 ****
--- 1379,1385 ----
   * update profiling information for the user process.
   */
  
+ 	ALIGN32
  ENTRY(addupc)
  	pushl %ebp
  	movl %esp,%ebp
***************
*** 1358,1363 ****
--- 1409,1415 ----
  	leave
  	ret
  
+ 	ALIGN32
  proffault:
  	/* if we get a fault, then kill profiling all together */
  	movl $0,PCB_ONFAULT(%edx)	/* squish the fault handler */
***************
*** 1367,1372 ****
--- 1419,1425 ----
  	ret
  
  .data
+ 	ALIGN32
  	.globl	_cyloffset, _curpcb
  _cyloffset:	.long	0
  	.globl	_proc0paddr
***************
*** 1466,1471 ****
--- 1519,1525 ----
  IDTVEC(rsvd14)
  	pushl $0; TRAP(31)
  
+ 	ALIGN32
  alltraps:
  	pushal
  	nop
***************
*** 1491,1496 ****
--- 1545,1551 ----
   * This code checks for a kgdb trap, then falls through
   * to the regular trap code.
   */
+ 	ALIGN32
  bpttraps:
  	pushal
  	nop
***************
*** 1511,1516 ****
--- 1566,1572 ----
   * Call gate entry for syscall
   */
  
+ 	ALIGN32
  IDTVEC(syscall)
  	pushfl	# only for stupid carry bit and more stupid wait3 cc kludge
  	pushal	# only need eax,ecx,edx - trap resaves others
***************
*** 1529,1534 ****
--- 1585,1591 ----
  	popfl
  	lret
  
+ 	ALIGN32
  ENTRY(htonl)
  ENTRY(ntohl)
  	movl	4(%esp),%eax
***************
*** 1537,1542 ****
--- 1594,1600 ----
  	xchgb	%al,%ah
  	ret
  
+ 	ALIGN32
  ENTRY(htons)
  ENTRY(ntohs)
  	movzwl	4(%esp),%eax
diff -rc i386.ORIG/isa/icu.s i386/isa/icu.s
*** i386.ORIG/isa/icu.s	Tue May 12 20:21:27 1992
--- i386/isa/icu.s	Mon Oct 19 15:27:54 1992
***************
*** 43,48 ****
--- 43,49 ----
   */
  
  	.data
+ 	ALIGN32
  	.globl	_imen
  	.globl	_cpl
  _cpl:	.long	0xffff			# current priority level (all off)
***************
*** 62,67 ****
--- 63,69 ----
  /*
   * Handle return from interrupt after device handler finishes
   */
+ 	ALIGN32
  doreti:
  	cli
  	popl	%ebx			# remove intr number
***************
*** 89,94 ****
--- 91,97 ----
  	addl	$8,%esp
  	iret
  
+ 	ALIGN32
  1:	cmpl	$0,_netisr		# check for softint s/traps
  	jne	1f
  	cmpl	$0,_want_resched
***************
*** 102,107 ****
--- 105,111 ----
  	
  #include "../net/netisr.h"
  
+ 	ALIGN32
  1:
  
  #define DONET(s, c)	; \
***************
*** 171,176 ****
--- 175,181 ----
  
  	.globl	_splhigh
  	.globl	_splclock
+ 	ALIGN32
  _splhigh:
  _splclock:
  	cli				# disable interrupts
***************
*** 190,195 ****
--- 195,201 ----
  	ret
  
  	.globl	_spltty			# block clists
+ 	ALIGN32
  _spltty:
  	cli				# disable interrupts
  	NOP
***************
*** 210,215 ****
--- 216,222 ----
  
  	.globl	_splimp
  	.globl	_splnet
+ 	ALIGN32
  _splimp:
  _splnet:
  	cli				# disable interrupts
***************
*** 230,235 ****
--- 237,243 ----
  	ret
  
  	.globl	_splbio	
+ 	ALIGN32
  _splbio:
  	cli				# disable interrupts
  	NOP
***************
*** 249,254 ****
--- 257,263 ----
  	ret
  
  	.globl	_splsoftclock
+ 	ALIGN32
  _splsoftclock:
  	cli				# disable interrupts
  	NOP
***************
*** 269,274 ****
--- 278,284 ----
  
  	.globl _splnone
  	.globl _spl0
+ 	ALIGN32
  _splnone:
  _spl0:
  	cli				# disable interrupts
***************
*** 307,312 ****
--- 317,323 ----
  	ret
  
  	.globl _splx
+ 	ALIGN32
  _splx:
  	cli				# disable interrupts
  	NOP

--
UUCP: {hplabs!hp-sdd ucsd nosc}!crash!fpm
ARPA: crash!fpm@nosc.mil
INET: fpm@crash.cts.com