Sat Oct 14 02:52:36 1995 Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>

* malloc/malloc.c (_malloc_internal): Performance fix.  Move
	if statement out of loop.

	* stdio/_itoa.c, stdio/_itoa.h: Complete rewrite.  Much faster
	implementation using GMP functions.  Contributed by
	Torbjorn Granlund and Ulrich Drepper.

	* stdio/test_rdwr.c: Include <errno.h>.

	* sysdeps/i386/i586/Implies: New file.

	New highly optimized string functions for i[345]86.
	* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
        * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
        * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
        * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
        * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
        * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
        * sysdeps/i386/i586/strlen.S: New file.
	* sysdeps/i386/memchr.c: Removed.  There is now an assembler version.

	* sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did
	not correspond to used values.

	* sysdeps/unix/sysv/linux/nfs/nfs.h: New file.  Simply a wrapper
        around a kernel header file.
	* sysdeps/unix/sysv/linux/Dist: Add it.
	* sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
	Likewise.

	* sysdeps/unix/sysv/linux/local_lim.h: Rewrite.  Instead of
        defining the limits ourselves we use a kernel header file.

	* sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
        call handler for i586.

	* sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
Sat Oct 14 02:52:36 1995  Ulrich Drepper  <drepper@ipd.info.uni-karlsruhe.de>

	* malloc/malloc.c (_malloc_internal): Performance fix.  Move
	if statement out of loop.

	* stdio/_itoa.c, stdio/_itoa.h: Complete rewrite.  Much faster
	implementation using GMP functions.  Contributed by
	Torbjorn Granlund and Ulrich Drepper.

	* stdio/test_rdwr.c: Include <errno.h>.

	* sysdeps/i386/i586/Implies: New file.

	New highly optimized string functions for i[345]86.
	* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
        * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
        * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
        * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
        * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
        * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
        * sysdeps/i386/i586/strlen.S: New file.
	* sysdeps/i386/memchr.c: Removed.  There is now an assembler version.

	* sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did
	not correspond to used values.

	* sysdeps/unix/sysv/linux/nfs/nfs.h: New file.  Simply a wrapper
        around a kernel header file.
	* sysdeps/unix/sysv/linux/Dist: Add it.
	* sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
	Likewise.

	* sysdeps/unix/sysv/linux/local_lim.h: Rewrite.  Instead of
        defining the limits ourselves we use a kernel header file.

	* sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
        call handler for i586.

	* sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
This commit is contained in:
Roland McGrath 1995-10-16 01:37:51 +00:00
parent 5d82cf5c55
commit 8f5ca04bc7
109 changed files with 9751 additions and 357 deletions

View File

@ -5,7 +5,7 @@ glibc-*
configparms
sun4 i386 i386-gnuelf hp300-netbsd hp300 i486-linux
sun[43]* i[345]86* hp300*
ieeetest hppa-sysdeps regex

View File

@ -1,3 +1,43 @@
Sat Oct 14 02:52:36 1995 Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>
* malloc/malloc.c (_malloc_internal): Performance fix. Move
if statement out of loop.
* stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster
implementation using GMP functions. Contributed by
Torbjorn Granlund and Ulrich Drepper.
* stdio/test_rdwr.c: Include <errno.h>.
* sysdeps/i386/i586/Implies: New file.
New highly optimized string functions for i[345]86.
* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
* sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
* sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
* sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
* sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
* sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
* sysdeps/i386/i586/strlen.S: New file.
* sysdeps/i386/memchr.c: Removed. There is now an assembler version.
* sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did
not correspond to used values.
* sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper
around a kernel header file.
* sysdeps/unix/sysv/linux/Dist: Add it.
* sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
Likewise.
* sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of
defining the limits ourselves we use a kernel header file.
* sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
call handler for i586.
* sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
Wed Oct 11 00:00:00 1995 Roland McGrath <roland@churchy.gnu.ai.mit.edu>
* sysdeps/i386/dl-machine.h (elf_machine_rel): Use +=, not =, to

View File

@ -82,22 +82,18 @@ changequote(,)dnl
# Expand the configuration machine name into a subdirectory by architecture
# type and particular chip.
case "$machine" in
i[345]86)
machine=i386/$machine ;;
sparc[6789])
machine=sparc/$machine ;;
m68k)
machine=m68k/m68020 ;;
m680?0)
machine=m68k/$machine ;;
m88k)
machine=m88k/m88100 ;;
m88???)
machine=m88k/$machine ;;
mips64*)
machine=mips/mips64/$machine ;;
mips*)
machine=mips/$machine ;;
a29k | am29000) machine=a29k ;;
alpha*) machine=alpha/$machine ;;
hppa*) machine=hppa/$machine ;;
i[345]86) machine=i386/$machine ;;
m680?0) machine=m68k/$machine ;;
m68k) machine=m68k/m68020 ;;
m88???) machine=m88k/$machine ;;
m88k) machine=m88k/m88100 ;;
mips*) machine=mips/$machine ;;
mips64*) machine=mips/mips64/$machine ;;
sparc[6789]) machine=sparc/$machine ;;
supersparc) machine=sparc/sparc8 ;;
esac
# Make sco3.2v4 become sco3.2.4 and sunos4.1.1_U1 become sunos4.1.1.U1.

View File

@ -26,7 +26,7 @@ include ../Makeconfig
headers = hurd.h $(interface-headers) \
$(addprefix hurd/,fd.h id.h port.h signal.h userlink.h \
resource.h threadvar.h)
resource.h threadvar.h lookup.h)
distribute := hurdstartup.h hurdfault.h intr-rpc.defs STATUS
@ -44,7 +44,7 @@ routines = hurdstartup hurdinit \
setauth \
pid2task task2pid \
getuids setuids getumask fchroot \
hurdsock hurdauth invoke-trans \
hurdsock hurdauth \
privports \
msgportdemux \
fopenport \

View File

@ -77,11 +77,16 @@ extern struct hurd_port *_hurd_ports;
extern unsigned int _hurd_nports;
extern volatile mode_t _hurd_umask;
/* Shorthand macro for referencing _hurd_ports (see <hurd/port.h>). */
/* Shorthand macro for internal library code referencing _hurd_ports (see
<hurd/port.h>). */
#define __USEPORT(which, expr) \
HURD_PORT_USE (&_hurd_ports[INIT_PORT_##which], (expr))
/* Function version of __USEPORT: calls OPERATE with a send right. */
extern error_t _hurd_ports_use (int which, error_t (*operate) (mach_port_t));
/* Base address and size of the initial stack set up by the exec server.
If using cthreads, this stack is deallocated in startup.
@ -150,52 +155,6 @@ extern int setcttyid (mach_port_t);
extern int __setauth (auth_t), setauth (auth_t);
/* Split FILE into a directory and a name within the directory. Look up a
port for the directory and store it in *DIR; store in *NAME a pointer
into FILE where the name within directory begins. The directory lookup
uses CRDIR for the root directory and CWDIR for the current directory.
Returns zero on success or an error code. */
extern error_t __hurd_file_name_split (file_t crdir, file_t cwdir,
const char *file,
file_t *dir, char **name);
extern error_t hurd_file_name_split (file_t crdir, file_t cwdir,
const char *file,
file_t *dir, char **name);
/* Open a port to FILE with the given FLAGS and MODE (see <fcntl.h>).
The file lookup uses CRDIR for the root directory and CWDIR for the
current directory. If successful, returns zero and store the port
to FILE in *PORT; otherwise returns an error code. */
extern error_t __hurd_file_name_lookup (file_t crdir, file_t cwdir,
const char *file,
int flags, mode_t mode,
file_t *port);
extern error_t hurd_file_name_lookup (file_t crdir, file_t cwdir,
const char *filename,
int flags, mode_t mode,
file_t *port);
/* Process the values returned by `dir_lookup' et al, and loop doing
`dir_lookup' calls until one returns FS_RETRY_NONE. CRDIR is the
root directory used for things like symlinks to absolute file names; the
other arguments should be those just passed to and/or returned from
`dir_lookup', `fsys_getroot', or `file_invoke_translator'. This
function consumes the reference in *RESULT even if it returns an error. */
extern error_t __hurd_file_name_lookup_retry (file_t crdir,
enum retry_type doretry,
char retryname[1024],
int flags, mode_t mode,
file_t *result);
extern error_t hurd_file_name_lookup_retry (file_t crdir,
enum retry_type doretry,
char retryname[1024],
int flags, mode_t mode,
file_t *result);
/* Split FILE into a directory and a name within the directory. The
directory lookup uses the current root and working directory. If
successful, stores in *NAME a pointer into FILE where the name
@ -213,15 +172,15 @@ extern file_t file_name_split (const char *file, char **name);
extern file_t __file_name_lookup (const char *file, int flags, mode_t mode);
extern file_t file_name_lookup (const char *file, int flags, mode_t mode);
/* Invoke any translator set on the node FILE represents, and return in
*TRANSLATED a port to the translated node. FLAGS are as for
`dir_lookup' et al, but the returned port will not necessarily have
any more access rights than FILE does. */
/* Open a port to FILE with the given FLAGS and MODE (see <fcntl.h>). The
file lookup uses the current root directory, but uses STARTDIR as the
"working directory" for file relative names. Returns a port to the file
if successful; otherwise sets `errno' and returns MACH_PORT_NULL. */
extern error_t __hurd_invoke_translator (file_t file, int flags,
file_t *translated);
extern error_t hurd_invoke_translator (file_t file, int flags,
file_t *translated);
extern file_t __file_name_lookup_under (file_t startdir, const char *file,
int flags, mode_t mode);
extern file_t file_name_lookup_under (file_t startdir, const char *file,
int flags, mode_t mode);
/* Open a file descriptor on a port. FLAGS are as for `open'; flags

View File

@ -31,6 +31,12 @@ struct hurd_port *_hurd_ports;
unsigned int _hurd_nports;
mode_t _hurd_umask;
/* Function counterpart of the __USEPORT macro: run OPERATE on the port
   stored in slot WHICH of _hurd_ports and hand back whatever error code
   OPERATE produced.  The port name `port' is supplied by HURD_PORT_USE.  */
error_t
_hurd_ports_use (int which, error_t (*operate) (mach_port_t))
{
  error_t result;

  result = HURD_PORT_USE (&_hurd_ports[which], (*operate) (port));
  return result;
}
void _hurd_proc_init (char **argv);
DEFINE_HOOK (_hurd_subinit, (void));

View File

@ -1,6 +1,8 @@
/* Internal function for converting integers to ASCII.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Torbjorn Granlund <tege@matematik.su.se>
and Ulrich Drepper <drepper@gnu.ai.mit.edu>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@ -17,13 +19,400 @@ License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 675 Mass Ave,
Cambridge, MA 02139, USA. */
/* Lower-case digits. */
const char _itoa_lower_digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
/* Upper-case digits. */
const char _itoa_upper_digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
/* Cause _itoa.h to define _itoa as a real function instead of an
`extern inline'. */
#define _EXTERN_INLINE /* empty */
#include <gmp-mparam.h>
#include "../stdlib/gmp.h"
#include "../stdlib/gmp-impl.h"
#include "../stdlib/longlong.h"
#include "_itoa.h"
/* Canonize environment. For some architectures not all values might
be defined in the GMP header files. */
#ifndef UMUL_TIME
# define UMUL_TIME 1
#endif
#ifndef UDIV_TIME
# define UDIV_TIME 1
#endif
/* Control memory layout. */
#ifdef PACK
# undef PACK
# define PACK __attribute__ ((packed))
#else
# define PACK
#endif
/* Per-base conversion parameters used by _itoa.  One record exists for
   every base from 2 to 36; the record for base B lives at index B - 2.

   NOTE(review): the 64-bit initializers and the 64-bit code path in
   _itoa use `base_multiplier' unconditionally, while the field is only
   declared when UDIV_TIME > 2 * UMUL_TIME -- confirm that every 64-bit
   configuration satisfies that condition.  */
struct base_table_t
{
#if (UDIV_TIME > 2 * UMUL_TIME)
/* Fixed-point reciprocal of the base: _itoa takes the high limb of
   VALUE * base_multiplier and shifts it right to obtain the quotient.  */
mp_limb base_multiplier;
#endif
/* Nonzero selects the quotient formula with the extra add-and-halve
   correction step in _itoa.  */
char flag;
/* Right shift applied to the high product limb to get the quotient.  */
char post_shift;
#if BITS_PER_MP_LIMB == 32
/* With 32-bit limbs a 64-bit value is first cut into pieces by dividing
   by `base', which holds a power of the conversion base.  */
struct
{
/* Left shift used to form the normalized divisor passed to the
   udiv_qrnnd macros.  */
char normalization_steps;
/* Number of digits each non-leading piece is zero-padded to.  */
char ndigits;
/* The divisor used to cut the value into pieces.  */
mp_limb base PACK;
#if UDIV_TIME > 2 * UMUL_TIME
/* Pre-computed inverse handed to udiv_qrnnd_preinv as its DI argument.  */
mp_limb base_ninv PACK;
#endif
} big;
#endif
};
/* To reduce the memory needed we include some fields of the tables
only conditionally. */
#if BITS_PER_MP_LIMB == 32
# if UDIV_TIME > 2 * UMUL_TIME
# define SEL1(X) X,
# define SEL2(X) ,X
# else
# define SEL1(X)
# define SEL2(X)
# endif
#endif
/* Conversion parameters for every base from 2 to 36, indexed by
   BASE - 2 (see struct base_table_t for the meaning of each field).
   The 64-bit rows are {base_multiplier, flag, post_shift}.  The 32-bit
   rows are {[base_multiplier,] flag, post_shift,
   {normalization_steps, ndigits, base [, base_ninv]}}, where the
   bracketed members are supplied through SEL1/SEL2 and vanish when
   those macros expand to nothing.  */
static const struct base_table_t base_table[] =
{
#if BITS_PER_MP_LIMB == 64
/* 2 */ {0ul, 1, 1},
/* 3 */ {0xaaaaaaaaaaaaaaabul, 0, 1},
/* 4 */ {0ul, 1, 2},
/* 5 */ {0xcccccccccccccccdul, 0, 2},
/* 6 */ {0xaaaaaaaaaaaaaaabul, 0, 2},
/* 7 */ {0x2492492492492493ul, 1, 3},
/* 8 */ {0ul, 1, 3},
/* 9 */ {0xe38e38e38e38e38ful, 0, 3},
/* 10 */ {0xcccccccccccccccdul, 0, 3},
/* 11 */ {0x2e8ba2e8ba2e8ba3ul, 0, 1},
/* 12 */ {0xaaaaaaaaaaaaaaabul, 0, 3},
/* 13 */ {0x4ec4ec4ec4ec4ec5ul, 0, 2},
/* 14 */ {0x2492492492492493ul, 1, 4},
/* 15 */ {0x8888888888888889ul, 0, 3},
/* 16 */ {0ul, 1, 4},
/* 17 */ {0xf0f0f0f0f0f0f0f1ul, 0, 4},
/* 18 */ {0xe38e38e38e38e38ful, 0, 4},
/* 19 */ {0xd79435e50d79435ful, 0, 4},
/* 20 */ {0xcccccccccccccccdul, 0, 4},
/* 21 */ {0x8618618618618619ul, 1, 5},
/* 22 */ {0x2e8ba2e8ba2e8ba3ul, 0, 2},
/* 23 */ {0x642c8590b21642c9ul, 1, 5},
/* 24 */ {0xaaaaaaaaaaaaaaabul, 0, 4},
/* 25 */ {0x47ae147ae147ae15ul, 1, 5},
/* 26 */ {0x4ec4ec4ec4ec4ec5ul, 0, 3},
/* 27 */ {0x97b425ed097b425ful, 0, 4},
/* 28 */ {0x2492492492492493ul, 1, 5},
/* 29 */ {0x1a7b9611a7b9611bul, 1, 5},
/* 30 */ {0x8888888888888889ul, 0, 4},
/* 31 */ {0x0842108421084211ul, 1, 5},
/* 32 */ {0ul, 1, 5},
/* 33 */ {0x0f83e0f83e0f83e1ul, 0, 1},
/* 34 */ {0xf0f0f0f0f0f0f0f1ul, 0, 5},
/* 35 */ {0xea0ea0ea0ea0ea0ful, 0, 5},
/* 36 */ {0xe38e38e38e38e38ful, 0, 5}
#endif
#if BITS_PER_MP_LIMB == 32
/* 2 */ {SEL1(0ul) 1, 1, {0, 31, 0x80000000ul SEL2(0xfffffffful)}},
/* 3 */ {SEL1(0xaaaaaaabul) 0, 1, {0, 20, 0xcfd41b91ul SEL2(0x3b563c24ul)}},
/* 4 */ {SEL1(0ul) 1, 2, {1, 15, 0x40000000ul SEL2(0xfffffffful)}},
/* 5 */ {SEL1(0xcccccccdul) 0, 2, {1, 13, 0x48c27395ul SEL2(0xc25c2684ul)}},
/* 6 */ {SEL1(0xaaaaaaabul) 0, 2, {0, 12, 0x81bf1000ul SEL2(0xf91bd1b6ul)}},
/* 7 */ {SEL1(0x24924925ul) 1, 3, {1, 11, 0x75db9c97ul SEL2(0x1607a2cbul)}},
/* 8 */ {SEL1(0ul) 1, 3, {1, 10, 0x40000000ul SEL2(0xfffffffful)}},
/* 9 */ {SEL1(0x38e38e39ul) 0, 1, {0, 10, 0xcfd41b91ul SEL2(0x3b563c24ul)}},
/* 10 */ {SEL1(0xcccccccdul) 0, 3, {2, 9, 0x3b9aca00ul SEL2(0x12e0be82ul)}},
/* 11 */ {SEL1(0xba2e8ba3ul) 0, 3, {0, 9, 0x8c8b6d2bul SEL2(0xd24cde04ul)}},
/* 12 */ {SEL1(0xaaaaaaabul) 0, 3, {3, 8, 0x19a10000ul SEL2(0x3fa39ab5ul)}},
/* 13 */ {SEL1(0x4ec4ec4ful) 0, 2, {2, 8, 0x309f1021ul SEL2(0x50f8ac5ful)}},
/* 14 */ {SEL1(0x24924925ul) 1, 4, {1, 8, 0x57f6c100ul SEL2(0x74843b1eul)}},
/* 15 */ {SEL1(0x88888889ul) 0, 3, {0, 8, 0x98c29b81ul SEL2(0xad0326c2ul)}},
/* 16 */ {SEL1(0ul) 1, 4, {3, 7, 0x10000000ul SEL2(0xfffffffful)}},
/* 17 */ {SEL1(0xf0f0f0f1ul) 0, 4, {3, 7, 0x18754571ul SEL2(0x4ef0b6bdul)}},
/* 18 */ {SEL1(0x38e38e39ul) 0, 2, {2, 7, 0x247dbc80ul SEL2(0xc0fc48a1ul)}},
/* 19 */ {SEL1(0xaf286bcbul) 1, 5, {2, 7, 0x3547667bul SEL2(0x33838942ul)}},
/* 20 */ {SEL1(0xcccccccdul) 0, 4, {1, 7, 0x4c4b4000ul SEL2(0xad7f29abul)}},
/* 21 */ {SEL1(0x86186187ul) 1, 5, {1, 7, 0x6b5a6e1dul SEL2(0x313c3d15ul)}},
/* 22 */ {SEL1(0xba2e8ba3ul) 0, 4, {0, 7, 0x94ace180ul SEL2(0xb8cca9e0ul)}},
/* 23 */ {SEL1(0xb21642c9ul) 0, 4, {0, 7, 0xcaf18367ul SEL2(0x42ed6de9ul)}},
/* 24 */ {SEL1(0xaaaaaaabul) 0, 4, {4, 6, 0x0b640000ul SEL2(0x67980e0bul)}},
/* 25 */ {SEL1(0x51eb851ful) 0, 3, {4, 6, 0x0e8d4a51ul SEL2(0x19799812ul)}},
/* 26 */ {SEL1(0x4ec4ec4ful) 0, 3, {3, 6, 0x1269ae40ul SEL2(0xbce85396ul)}},
/* 27 */ {SEL1(0x2f684bdbul) 1, 5, {3, 6, 0x17179149ul SEL2(0x62c103a9ul)}},
/* 28 */ {SEL1(0x24924925ul) 1, 5, {3, 6, 0x1cb91000ul SEL2(0x1d353d43ul)}},
/* 29 */ {SEL1(0x8d3dcb09ul) 0, 4, {2, 6, 0x23744899ul SEL2(0xce1deceaul)}},
/* 30 */ {SEL1(0x88888889ul) 0, 4, {2, 6, 0x2b73a840ul SEL2(0x790fc511ul)}},
/* 31 */ {SEL1(0x08421085ul) 1, 5, {2, 6, 0x34e63b41ul SEL2(0x35b865a0ul)}},
/* 32 */ {SEL1(0ul) 1, 5, {1, 6, 0x40000000ul SEL2(0xfffffffful)}},
/* 33 */ {SEL1(0x3e0f83e1ul) 0, 3, {1, 6, 0x4cfa3cc1ul SEL2(0xa9aed1b3ul)}},
/* 34 */ {SEL1(0xf0f0f0f1ul) 0, 5, {1, 6, 0x5c13d840ul SEL2(0x63dfc229ul)}},
/* 35 */ {SEL1(0xd41d41d5ul) 1, 6, {1, 6, 0x6d91b519ul SEL2(0x2b0fee30ul)}},
/* 36 */ {SEL1(0x38e38e39ul) 0, 3, {0, 6, 0x81bf1000ul SEL2(0xf91bd1b6ul)}}
#endif
};
/* Lower-case digits. */
static const char _itoa_lower_digits[]
= "0123456789abcdefghijklmnopqrstuvwxyz";
/* Upper-case digits. */
static const char _itoa_upper_digits[]
= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
/* Convert VALUE into ASCII digits in base BASE, writing them backwards so
   that the last digit lands just before BUFLIM, and return the address of
   the first (most significant) digit written.  No terminating NUL is
   stored.  A nonzero UPPER_CASE selects _itoa_upper_digits, otherwise
   _itoa_lower_digits is used.

   BASE indexes base_table as BASE - 2 without a range check, so callers
   must pass 2..36.  Bases 8 and 16 are handled with shifts and masks and
   always produce at least one digit; every other base goes through the
   division code below, which writes nothing when VALUE is zero.  */
char *
_itoa (value, buflim, base, upper_case)
     unsigned long long int value;
     char *buflim;
     unsigned int base;
     int upper_case;
{
  const char *digits = upper_case ? _itoa_upper_digits : _itoa_lower_digits;
  char *bp = buflim;
  const struct base_table_t *brec = &base_table[base - 2];

  switch (base)
    {
/* Emit VALUE in base 2**BITS.  With 32-bit limbs the low limb is fully
   converted first, then the digit straddling both limbs (if BITS does
   not divide the limb size), then the remainder of the high limb.  */
#define RUN_2N(BITS) \
      do \
        { \
          /* `unsigned long long int' always has 64 bits.  */ \
          mp_limb work_hi = value >> (64 - BITS_PER_MP_LIMB); \
          \
          if (BITS_PER_MP_LIMB == 32) \
            { \
              if (work_hi != 0) \
                { \
                  mp_limb work_lo; \
                  int cnt; \
                  \
                  work_lo = value & 0xfffffffful; \
                  for (cnt = BITS_PER_MP_LIMB / BITS; cnt > 0; --cnt) \
                    { \
                      *--bp = digits[work_lo & ((1ul << BITS) - 1)]; \
                      work_lo >>= BITS; \
                    } \
                  if (BITS_PER_MP_LIMB % BITS != 0) \
                    { \
                      /* Complete the straddling digit with the lowest \
                         bits of the high limb.  */ \
                      work_lo |= ((work_hi \
                                   & ((1 << (BITS \
                                             - BITS_PER_MP_LIMB % BITS)) \
                                      - 1)) \
                                  << BITS_PER_MP_LIMB % BITS); \
                      work_hi >>= BITS - BITS_PER_MP_LIMB % BITS; \
                      /* If nothing is left in the high limb, the \
                         straddling digit is the leading digit: let the \
                         loop below emit it instead of emitting it here \
                         and then a spurious leading zero.  */ \
                      if (work_hi == 0) \
                        work_hi = work_lo; \
                      else \
                        *--bp = digits[work_lo]; \
                    } \
                } \
              else \
                work_hi = value & 0xfffffffful; \
            } \
          do \
            { \
              *--bp = digits[work_hi & ((1 << BITS) - 1)]; \
              work_hi >>= BITS; \
            } \
          while (work_hi != 0); \
        } \
      while (0)

    case 8:
      RUN_2N (3);
      break;

    case 16:
      RUN_2N (4);
      break;

    default:
      {
#if BITS_PER_MP_LIMB == 64
        /* A limb holds the whole value: divide by BASE repeatedly using
           multiplication by the pre-computed reciprocal.  */
        mp_limb base_multiplier = brec->base_multiplier;
        if (brec->flag)
          while (value != 0)
            {
              mp_limb quo, rem, x, dummy;

              umul_ppmm (x, dummy, value, base_multiplier);
              quo = (x + ((value - x) >> 1)) >> (brec->post_shift - 1);
              rem = value - quo * base;
              *--bp = digits[rem];
              value = quo;
            }
        else
          while (value != 0)
            {
              mp_limb quo, rem, x, dummy;

              umul_ppmm (x, dummy, value, base_multiplier);
              quo = x >> brec->post_shift;
              rem = value - quo * base;
              *--bp = digits[rem];
              value = quo;
            }
#endif
#if BITS_PER_MP_LIMB == 32
        mp_limb t[3];
        int n;

        /* First convert VALUE to 1-3 words in base brec->big.base;
           t[0] receives the most significant word.
           Optimize for frequent cases of 32 bit numbers.  */
        if ((mp_limb) (value >> 32) >= 1)
          {
            int big_normalization_steps = brec->big.normalization_steps;
            mp_limb big_base_norm = brec->big.base << big_normalization_steps;

            if ((mp_limb) (value >> 32) >= brec->big.base)
              {
                mp_limb x1hi, x1lo, r;
                /* If you want to optimize this, take advantage of
                   that the quotient in the first udiv_qrnnd will
                   always be very small.  It might be faster just to
                   subtract in a tight loop.  */

#if UDIV_TIME > 2 * UMUL_TIME
                mp_limb x, xh, xl;

                if (big_normalization_steps == 0)
                  xh = 0;
                else
                  xh = (mp_limb) (value >> (64 - big_normalization_steps));
                xl = (mp_limb) (value >> (32 - big_normalization_steps));
                udiv_qrnnd_preinv (x1hi, r, xh, xl, big_base_norm,
                                   brec->big.base_ninv);

                /* The last argument of udiv_qrnnd_preinv is the
                   pre-inverted divisor, not the shift count.  */
                xl = ((mp_limb) value) << big_normalization_steps;
                udiv_qrnnd_preinv (x1lo, x, r, xl, big_base_norm,
                                   brec->big.base_ninv);
                t[2] = x >> big_normalization_steps;

                if (big_normalization_steps == 0)
                  xh = x1hi;
                else
                  xh = ((x1hi << big_normalization_steps)
                        | (x1lo >> (32 - big_normalization_steps)));
                xl = x1lo << big_normalization_steps;
                udiv_qrnnd_preinv (t[0], x, xh, xl, big_base_norm,
                                   brec->big.base_ninv);
                t[1] = x >> big_normalization_steps;
#elif UDIV_NEEDS_NORMALIZATION
                mp_limb x, xh, xl;

                if (big_normalization_steps == 0)
                  xh = 0;
                else
                  xh = (mp_limb) (value >> (64 - big_normalization_steps));
                xl = (mp_limb) (value >> (32 - big_normalization_steps));
                udiv_qrnnd (x1hi, r, xh, xl, big_base_norm);

                xl = ((mp_limb) value) << big_normalization_steps;
                udiv_qrnnd (x1lo, x, r, xl, big_base_norm);
                t[2] = x >> big_normalization_steps;

                if (big_normalization_steps == 0)
                  xh = x1hi;
                else
                  xh = ((x1hi << big_normalization_steps)
                        | (x1lo >> (32 - big_normalization_steps)));
                xl = x1lo << big_normalization_steps;
                udiv_qrnnd (t[0], x, xh, xl, big_base_norm);
                t[1] = x >> big_normalization_steps;
#else
                udiv_qrnnd (x1hi, r, 0, (mp_limb) (value >> 32),
                            brec->big.base);
                udiv_qrnnd (x1lo, t[2], r, (mp_limb) value, brec->big.base);
                udiv_qrnnd (t[0], t[1], x1hi, x1lo, brec->big.base);
#endif
                n = 3;
              }
            else
              {
#if (UDIV_TIME > 2 * UMUL_TIME)
                mp_limb x;

                value <<= brec->big.normalization_steps;
                udiv_qrnnd_preinv (t[0], x, (mp_limb) (value >> 32),
                                   (mp_limb) value, big_base_norm,
                                   brec->big.base_ninv);
                t[1] = x >> brec->big.normalization_steps;
#elif UDIV_NEEDS_NORMALIZATION
                mp_limb x;

                value <<= big_normalization_steps;
                udiv_qrnnd (t[0], x, (mp_limb) (value >> 32),
                            (mp_limb) value, big_base_norm);
                t[1] = x >> big_normalization_steps;
#else
                udiv_qrnnd (t[0], t[1], (mp_limb) (value >> 32),
                            (mp_limb) value, brec->big.base);
#endif
                n = 2;
              }
          }
        else
          {
            t[0] = value;
            n = 1;
          }

        /* Convert the 1-3 words in t[], word by word, to ASCII.  The
           least significant word is handled first because the digits
           are stored backwards.  */
        do
          {
            mp_limb ti = t[--n];
            int ndig_for_this_limb = 0;

#if UDIV_TIME > 2 * UMUL_TIME
            mp_limb base_multiplier = brec->base_multiplier;
            if (brec->flag)
              while (ti != 0)
                {
                  mp_limb quo, rem, x, dummy;

                  umul_ppmm (x, dummy, ti, base_multiplier);
                  quo = (x + ((ti - x) >> 1)) >> (brec->post_shift - 1);
                  rem = ti - quo * base;
                  *--bp = digits[rem];
                  ti = quo;
                  ++ndig_for_this_limb;
                }
            else
              while (ti != 0)
                {
                  mp_limb quo, rem, x, dummy;

                  umul_ppmm (x, dummy, ti, base_multiplier);
                  quo = x >> brec->post_shift;
                  rem = ti - quo * base;
                  *--bp = digits[rem];
                  ti = quo;
                  ++ndig_for_this_limb;
                }
#else
            while (ti != 0)
              {
                mp_limb quo, rem;

                quo = ti / base;
                rem = ti % base;
                *--bp = digits[rem];
                ti = quo;
                ++ndig_for_this_limb;
              }
#endif
            /* If this wasn't the most significant word, pad with zeros.  */
            if (n != 0)
              while (ndig_for_this_limb < brec->big.ndigits)
                {
                  *--bp = '0';
                  ++ndig_for_this_limb;
                }
          }
        while (n != 0);
#endif
      }
      break;
    }

  return bp;
}

View File

@ -21,8 +21,6 @@ Cambridge, MA 02139, USA. */
#define _ITOA_H
#include <sys/cdefs.h>
extern const char _itoa_lower_digits[], _itoa_upper_digits[];
/* Convert VALUE into ASCII in base BASE (2..36).
Write backwards starting the character just before BUFLIM.
Return the address of the first (left-to-right) character in the number.
@ -31,28 +29,4 @@ extern const char _itoa_lower_digits[], _itoa_upper_digits[];
extern char *_itoa __P ((unsigned long long int value, char *buflim,
unsigned int base, int upper_case));
#ifndef _EXTERN_INLINE
#define _EXTERN_INLINE extern __inline
#endif
/* Simple digit-at-a-time conversion: store the digits of VALUE in base
   BASE backwards, ending just before BUFLIM, and return a pointer to
   the first digit.  Nothing is stored when VALUE is zero.  */
_EXTERN_INLINE
char *
_itoa (unsigned long long int value, char *buflim,
       unsigned int base, int upper_case)
{
  const char *digits;
  register char *bp;

  /* Pick the digit alphabet for the requested letter case.  */
  if (upper_case)
    digits = _itoa_upper_digits;
  else
    digits = _itoa_lower_digits;

  for (bp = buflim; value > 0; value /= base)
    *--bp = digits[value % base];

  return bp;
}
#endif /* itoa.h */

View File

@ -17,6 +17,7 @@ not, write to the Free Software Foundation, Inc., 675 Mass Ave,
Cambridge, MA 02139, USA. */
#include <ansidecl.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

View File

@ -19,11 +19,17 @@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#if ! defined (alloca)
#if defined (__GNUC__) || defined (__sparc__) || defined (sparc)
#if defined (__GNUC__)
#define alloca __builtin_alloca
#endif
#endif
#if ! defined (alloca)
#if defined (__sparc__) || defined (sparc) || defined (__sgi)
#include <alloca.h>
#endif
#endif
#ifndef NULL
#define NULL 0L
#endif
@ -168,6 +174,7 @@ void _mp_default_free ();
else \
____mpn_sqr_n (prodp, up, size, tspace); \
} while (0);
#define assert(trueval) do {if (!(trueval)) abort ();} while (0)
/* Structure for conversion between internal binary format and
strings in base 2..36. */
@ -197,9 +204,11 @@ struct bases
extern const struct bases __mp_bases[];
extern mp_size_t __gmp_default_fp_limb_precision;
/* Divide the two-limb number in (NH,,NL) by D, with DI being a 32 bit
approximation to (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
Put the quotient in Q and the remainder in R. */
/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest
limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
If this would yield overflow, DI should be the largest possible number
(i.e., only ones). For correct operation, the most significant bit of D
has to be set. Put the quotient in Q and the remainder in R. */
#define udiv_qrnnd_preinv(q, r, nh, nl, d, di) \
do { \
mp_limb _q, _ql, _r; \
@ -226,6 +235,8 @@ extern mp_size_t __gmp_default_fp_limb_precision;
(r) = _r; \
(q) = _q; \
} while (0)
/* Like udiv_qrnnd_preinv, but for any value D. DNORM is D shifted left
so that its most significant bit is set. LGUP is ceil(log2(D)). */
#define udiv_qrnnd_preinv2gen(q, r, nh, nl, d, di, dnorm, lgup) \
do { \
mp_limb n2, n10, n1, nadj, q1; \
@ -243,6 +254,8 @@ extern mp_size_t __gmp_default_fp_limb_precision;
(r) = _xl + ((d) & _xh); \
(q) = _xh - q1; \
} while (0)
/* Exactly like udiv_qrnnd_preinv, but branch-free. It is not clear which
version to use. */
#define udiv_qrnnd_preinv2norm(q, r, nh, nl, d, di) \
do { \
mp_limb n2, n10, n1, nadj, q1; \
@ -262,22 +275,49 @@ extern mp_size_t __gmp_default_fp_limb_precision;
} while (0)
#if defined (__GNUC__)
/* Define stuff for longlong.h asm macros. */
#if __GNUC_NEW_ATTR_MODE_SYNTAX
typedef unsigned int UQItype __attribute__ ((mode ("QI")));
typedef int SItype __attribute__ ((mode ("SI")));
typedef unsigned int USItype __attribute__ ((mode ("SI")));
typedef int DItype __attribute__ ((mode ("DI")));
typedef unsigned int UDItype __attribute__ ((mode ("DI")));
#else
/* Define stuff for longlong.h. */
typedef unsigned int UQItype __attribute__ ((mode (QI)));
typedef int SItype __attribute__ ((mode (SI)));
typedef unsigned int USItype __attribute__ ((mode (SI)));
typedef int DItype __attribute__ ((mode (DI)));
typedef unsigned int UDItype __attribute__ ((mode (DI)));
#endif
#else
typedef unsigned char UQItype;
typedef long SItype;
typedef unsigned long USItype;
#endif
typedef mp_limb UWtype;
typedef unsigned int UHWtype;
#define W_TYPE_SIZE BITS_PER_MP_LIMB
#ifndef IEEE_DOUBLE_BIG_ENDIAN
#define IEEE_DOUBLE_BIG_ENDIAN 1
#endif
/* Overlay giving bit-field access to the pieces of a `double': a 1-bit
   sign (sig), an 11-bit exponent (exp) and a mantissa split into a
   20-bit high part (manh) and a 32-bit low part (manl).  The two
   variants list the fields in opposite order; IEEE_DOUBLE_BIG_ENDIAN
   chooses between them.
   NOTE(review): how bit-fields are laid out inside the storage unit is
   up to the compiler -- confirm the chosen variant matches the target.  */
#if IEEE_DOUBLE_BIG_ENDIAN
union ieee_double_extract
{
struct
{
unsigned long sig:1;
unsigned long exp:11;
unsigned long manh:20;
unsigned long manl:32;
} s;
double d;
};
#else
/* Same fields, declared in the reverse order.  */
union ieee_double_extract
{
struct
{
unsigned long manl:32;
unsigned long manh:20;
unsigned long exp:11;
unsigned long sig:1;
} s;
double d;
};
#endif

View File

@ -24,13 +24,13 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#define __need_size_t
#include <stddef.h>
#ifdef __STDC__
#if defined (__STDC__)
#define __gmp_const const
#else
#define __gmp_const
#endif
#ifdef __GNUC__
#if defined (__GNUC__)
#define __gmp_inline inline
#else
#define __gmp_inline
@ -40,9 +40,14 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
typedef unsigned int mp_limb;
typedef int mp_limb_signed;
#else
#if _LONG_LONG_LIMB
typedef unsigned long long int mp_limb;
typedef long long int mp_limb_signed;
#else
typedef unsigned long int mp_limb;
typedef long int mp_limb_signed;
#endif
#endif
typedef mp_limb * mp_ptr;
typedef __gmp_const mp_limb * mp_srcptr;
@ -52,9 +57,9 @@ typedef long int mp_exp_t;
#ifndef __MP_SMALL__
typedef struct
{
long int alloc; /* Number of *limbs* allocated and pointed
mp_size_t alloc; /* Number of *limbs* allocated and pointed
to by the D field. */
long int size; /* abs(SIZE) is the number of limbs
mp_size_t size; /* abs(SIZE) is the number of limbs
the last field points to. If SIZE
is negative this is a negative
number. */
@ -130,12 +135,16 @@ typedef __mpf_struct *mpf_ptr;
typedef __gmp_const __mpq_struct *mpq_srcptr;
typedef __mpq_struct *mpq_ptr;
#ifdef __STDC__
#if defined (__STDC__)
#define _PROTO(x) x
#else
#define _PROTO(x) ()
#endif
#if defined (FILE) || defined (_STDIO_H_) || defined (__STDIO_H__) || defined (H_STDIO)
#define _GMP_H_HAVE_FILE 1
#endif
void mp_set_memory_functions _PROTO((void *(*) (size_t),
void *(*) (void *, size_t, size_t),
void (*) (void *, size_t)));
@ -165,7 +174,7 @@ unsigned long int mpz_get_ui _PROTO ((mpz_srcptr));
mp_limb mpz_getlimbn _PROTO ((mpz_srcptr, mp_size_t));
mp_size_t mpz_hamdist _PROTO ((mpz_srcptr, mpz_srcptr));
void mpz_init _PROTO ((mpz_ptr));
#ifdef FILE
#ifdef _GMP_H_HAVE_FILE
void mpz_inp_raw _PROTO ((mpz_ptr, FILE *));
int mpz_inp_str _PROTO ((mpz_ptr, FILE *, int));
#endif
@ -180,7 +189,7 @@ void mpz_mul _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
void mpz_mul_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
void mpz_mul_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
void mpz_neg _PROTO ((mpz_ptr, mpz_srcptr));
#ifdef FILE
#ifdef _GMP_H_HAVE_FILE
void mpz_out_raw _PROTO ((FILE *, mpz_srcptr));
void mpz_out_str _PROTO ((FILE *, int, mpz_srcptr));
#endif
@ -218,6 +227,8 @@ void mpz_tdiv_qr_ui _PROTO((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
void mpz_tdiv_r _PROTO((mpz_ptr, mpz_srcptr, mpz_srcptr));
void mpz_tdiv_r_ui _PROTO((mpz_ptr, mpz_srcptr, unsigned long int));
void mpz_array_init (mpz_ptr, size_t, mp_size_t);
/**************** Rational (i.e. Q) routines. ****************/
void mpq_init _PROTO ((mpq_ptr));
@ -253,7 +264,7 @@ void mpf_dump _PROTO ((mpf_srcptr));
char *mpf_get_str _PROTO ((char *, mp_exp_t *, int, size_t, mpf_srcptr));
void mpf_init _PROTO ((mpf_ptr));
void mpf_init2 _PROTO ((mpf_ptr, mp_size_t));
#ifdef FILE
#ifdef _GMP_H_HAVE_FILE
void mpf_inp_str _PROTO ((mpf_ptr, FILE *, int));
#endif
void mpf_init_set _PROTO ((mpf_ptr, mpf_srcptr));
@ -265,7 +276,7 @@ void mpf_mul _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
void mpf_mul_2exp _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
void mpf_mul_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
void mpf_neg _PROTO ((mpf_ptr, mpf_srcptr));
#ifdef FILE
#ifdef _GMP_H_HAVE_FILE
void mpf_out_str _PROTO ((mpf_ptr, int, size_t, FILE *));
#endif
void mpf_set _PROTO ((mpf_ptr, mpf_srcptr));
@ -335,7 +346,7 @@ mp_limb __mpn_gcd_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb));
static __gmp_inline mp_limb
#if __STDC__
#if defined (__STDC__)
__mpn_add_1 (register mp_ptr res_ptr,
register mp_srcptr s1_ptr,
register mp_size_t s1_size,
@ -377,7 +388,7 @@ __mpn_add_1 (res_ptr, s1_ptr, s1_size, s2_limb)
}
static __gmp_inline mp_limb
#if __STDC__
#if defined (__STDC__)
__mpn_add (register mp_ptr res_ptr,
register mp_srcptr s1_ptr,
register mp_size_t s1_size,
@ -406,7 +417,7 @@ __mpn_add (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
}
static __gmp_inline mp_limb
#if __STDC__
#if defined (__STDC__)
__mpn_sub_1 (register mp_ptr res_ptr,
register mp_srcptr s1_ptr,
register mp_size_t s1_size,
@ -448,7 +459,7 @@ __mpn_sub_1 (res_ptr, s1_ptr, s1_size, s2_limb)
}
static __gmp_inline mp_limb
#if __STDC__
#if defined (__STDC__)
__mpn_sub (register mp_ptr res_ptr,
register mp_srcptr s1_ptr,
register mp_size_t s1_size,
@ -477,7 +488,7 @@ __mpn_sub (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
}
static __gmp_inline mp_size_t
#if __STDC__
#if defined (__STDC__)
__mpn_normal_size (mp_srcptr ptr, mp_size_t size)
#else
__mpn_normal_size (ptr, size)
@ -512,7 +523,6 @@ __mpn_normal_size (ptr, size)
/* Useful synonyms, but not quite compatible with GMP 1. */
#define mpz_div mpz_fdiv_q
#define mpz_divmod mpz_fdiv_qr
#define mpz_mod mpz_fdiv_r
#define mpz_div_ui mpz_fdiv_q_ui
#define mpz_divmod_ui mpz_fdiv_qr_ui
#define mpz_mod_ui mpz_fdiv_r_ui

View File

@ -97,7 +97,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#define __AND_CLOBBER_CC , "cc"
#endif /* __GNUC__ < 2 */
#if (defined (__a29k__) || defined (___AM29K__)) && W_TYPE_SIZE == 32
#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
__asm__ ("add %1,%4,%5
addc %0,%2,%3" \
@ -152,6 +152,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
(pl) = __m0 * __m1; \
} while (0)
#define UMUL_TIME 46
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
do { UDItype __r; \
(q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \
@ -159,7 +160,8 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
} while (0)
extern UDItype __udiv_qrnnd ();
#define UDIV_TIME 220
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* __alpha__ */
#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
@ -181,19 +183,19 @@ extern UDItype __udiv_qrnnd ();
"r" ((USItype)(al)), \
"rI" ((USItype)(bl)))
#define umul_ppmm(xh, xl, a, b) \
__asm__ ("; Inlined umul_ppmm
mov r0,%2 lsr 16
mov r2,%3 lsr 16
bic r1,%2,r0 lsl 16
bic r2,%3,r2 lsl 16
mul %1,r1,r2
mul r2,r0,r2
mul r1,%0,r1
mul %0,r0,%0
adds r1,r2,r1
addcs %0,%0,0x10000
adds %1,%1,r1 lsl 16
adc %0,%0,r1 lsr 16" \
__asm__ ("%@ Inlined umul_ppmm
mov %|r0, %2, lsr #16
mov %|r2, %3, lsr #16
bic %|r1, %2, %|r0, lsl #16
bic %|r2, %3, %|r2, lsl #16
mul %1, %|r1, %|r2
mul %|r2, %|r0, %|r2
mul %|r1, %0, %|r1
mul %0, %|r0, %0
adds %|r1, %|r2, %|r1
addcs %0, %0, #65536
adds %1, %1, %|r1, lsl #16
adc %0, %0, %|r1, lsr #16" \
: "=&r" ((USItype)(xh)), \
"=r" ((USItype)(xl)) \
: "r" ((USItype)(a)), \
@ -296,9 +298,9 @@ extern UDItype __udiv_qrnnd ();
struct {USItype __h, __l;} __i; \
} __xx; \
__asm__ ("xmpyu %1,%2,%0" \
: "=x" (__xx.__ll) \
: "x" ((USItype)(u)), \
"x" ((USItype)(v))); \
: "=fx" (__xx.__ll) \
: "fx" ((USItype)(u)), \
"fx" ((USItype)(v))); \
(wh) = __xx.__i.__h; \
(wl) = __xx.__i.__l; \
} while (0)
@ -308,12 +310,14 @@ extern UDItype __udiv_qrnnd ();
#define UMUL_TIME 40
#define UDIV_TIME 80
#endif
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
do { USItype __r; \
(q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \
(r) = __r; \
} while (0)
extern USItype __udiv_qrnnd ();
#endif /* LONGLONG_STANDALONE */
#define count_leading_zeros(count, x) \
do { \
USItype __tmp; \
@ -419,8 +423,12 @@ extern USItype __udiv_qrnnd ();
} while (0)
#define count_trailing_zeros(count, x) \
__asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)))
#ifndef UMUL_TIME
#define UMUL_TIME 40
#endif
#ifndef UDIV_TIME
#define UDIV_TIME 40
#endif
#endif /* 80x86 */
#if defined (__i960__) && W_TYPE_SIZE == 32
@ -442,7 +450,7 @@ extern USItype __udiv_qrnnd ();
__w; })
#endif /* __i960__ */
#if defined (__mc68000__) && W_TYPE_SIZE == 32
#if (defined (__mc68000__) || defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
__asm__ ("add%.l %5,%1
addx%.l %3,%0" \
@ -489,38 +497,34 @@ extern USItype __udiv_qrnnd ();
: "=d" ((USItype)(count)) \
: "od" ((USItype)(x)), "n" (0))
#else /* not mc68020 */
#define umul_ppmm(xh, xl, a, b) \
#define umul_ppmmxx(xh, xl, a, b) \
do { USItype __umul_tmp1, __umul_tmp2; \
__asm__ ("| Inlined umul_ppmm
move%.l %2,%/d0
move%.l %3,%/d1
move%.l %/d0,%/d2
swap %/d0
move%.l %/d1,%/d3
swap %/d1
move%.w %/d2,%/d4
mulu %/d3,%/d4
mulu %/d1,%/d2
mulu %/d0,%/d3
mulu %/d0,%/d1
move%.l %/d4,%/d0
eor%.w %/d0,%/d0
swap %/d0
add%.l %/d0,%/d2
add%.l %/d3,%/d2
move%.l %5,%3
move%.l %2,%0
move%.w %3,%1
swap %3
swap %0
mulu %2,%1
mulu %3,%0
mulu %2,%3
swap %2
mulu %5,%2
add%.l %3,%2
jcc 1f
add%.l #65536,%/d1
1: swap %/d2
moveq #0,%/d0
move%.w %/d2,%/d0
move%.w %/d4,%/d2
move%.l %/d2,%1
add%.l %/d1,%/d0
move%.l %/d0,%0" \
: "=g" ((USItype)(xh)), \
"=g" ((USItype)(xl)) \
: "g" ((USItype)(a)), \
"g" ((USItype)(b)) \
: "d0", "d1", "d2", "d3", "d4")
add%.l %#0x10000,%0
1: move%.l %2,%3
clr%.w %2
swap %2
swap %3
clr%.w %3
add%.l %3,%1
addx%.l %2,%0
| End inlined umul_ppmm" \
: "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \
"=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
: "%2" ((USItype)(a)), "d" ((USItype)(b))); \
} while (0)
#define UMUL_TIME 100
#define UDIV_TIME 400
#endif /* not mc68020 */
@ -553,7 +557,7 @@ extern USItype __udiv_qrnnd ();
: "r" ((USItype)(x))); \
(count) = __cbtmp ^ 31; \
} while (0)
#if defined (__mc88110__)
#if defined (__m88110__)
#define umul_ppmm(wh, wl, u, v) \
do { \
union {UDItype __ll; \
@ -582,10 +586,18 @@ extern USItype __udiv_qrnnd ();
#else
#define UMUL_TIME 17
#define UDIV_TIME 150
#endif /* __mc88110__ */
#endif /* __m88110__ */
#endif /* __m88000__ */
#if defined (__mips__) && W_TYPE_SIZE == 32
#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
#define umul_ppmm(w1, w0, u, v) \
__asm__ ("multu %2,%3" \
: "=l" ((USItype)(w0)), \
"=h" ((USItype)(w1)) \
: "d" ((USItype)(u)), \
"d" ((USItype)(v)))
#else
#define umul_ppmm(w1, w0, u, v) \
__asm__ ("multu %2,%3
mflo %0
@ -594,11 +606,20 @@ extern USItype __udiv_qrnnd ();
"=d" ((USItype)(w1)) \
: "d" ((USItype)(u)), \
"d" ((USItype)(v)))
#endif
#define UMUL_TIME 10
#define UDIV_TIME 100
#endif /* __mips__ */
#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
#define umul_ppmm(w1, w0, u, v) \
__asm__ ("dmultu %2,%3" \
: "=l" ((UDItype)(w0)), \
"=h" ((UDItype)(w1)) \
: "d" ((UDItype)(u)), \
"d" ((UDItype)(v)))
#else
#define umul_ppmm(w1, w0, u, v) \
__asm__ ("dmultu %2,%3
mflo %0
@ -607,8 +628,9 @@ extern USItype __udiv_qrnnd ();
"=d" ((UDItype)(w1)) \
: "d" ((UDItype)(u)), \
"d" ((UDItype)(v)))
#define UMUL_TIME 10
#define UDIV_TIME 100
#endif
#define UMUL_TIME 20
#define UDIV_TIME 140
#endif /* __mips__ */
#if defined (__ns32000__) && W_TYPE_SIZE == 32
@ -647,7 +669,7 @@ extern USItype __udiv_qrnnd ();
} while (0)
#endif /* __ns32000__ */
#if (defined (__powerpc__) || defined (___IBMR2__)) && W_TYPE_SIZE == 32
#if (defined (_ARCH_PPC) || defined (_IBMR2)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
do { \
if (__builtin_constant_p (bh) && (bh) == 0) \
@ -716,7 +738,7 @@ extern USItype __udiv_qrnnd ();
__asm__ ("{cntlz|cntlzw} %0,%1" \
: "=r" ((USItype)(count)) \
: "r" ((USItype)(x)))
#if defined (__powerpc__)
#if defined (_ARCH_PPC)
#define umul_ppmm(ph, pl, m0, m1) \
do { \
USItype __m0 = (m0), __m1 = (m1); \
@ -785,16 +807,15 @@ extern USItype __udiv_qrnnd ();
"g" ((USItype)(bh)), \
"1" ((USItype)(al)), \
"g" ((USItype)(bl)))
/* This insn doesn't work on ancient pyramids. */
/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
#define umul_ppmm(w1, w0, u, v) \
({union {UDItype __ll; \
struct {USItype __h, __l;} __i; \
} __xx; \
__xx.__i.__l = u; \
__asm__ ("uemul %3,%0" \
: "=r" (__xx.__i.__h), \
"=r" (__xx.__i.__l) \
: "1" (__xx.__i.__l), \
__asm__ ("movw %1,%R0
uemul %2,%0" \
: "=&r" (__xx.__ll) \
: "g" ((USItype) (u)), \
"g" ((USItype)(v))); \
(w1) = __xx.__i.__h; (w0) = __xx.__i.__l;})
#endif /* __pyr__ */
@ -868,6 +889,20 @@ extern USItype __udiv_qrnnd ();
} while (0)
#endif
#if defined (__sh2__) && W_TYPE_SIZE == 32
#define umul_ppmm(w1, w0, u, v) \
__asm__ ( \
"dmulu.l %2,%3
sts macl,%1
sts mach,%0" \
: "=r" ((USItype)(w1)), \
"=r" ((USItype)(w0)) \
: "r" ((USItype)(u)), \
"r" ((USItype)(v)) \
: "macl", "mach")
#define UMUL_TIME 5
#endif
#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
__asm__ ("addcc %r4,%5,%1
@ -901,17 +936,21 @@ extern USItype __udiv_qrnnd ();
: "r" ((USItype)(u)), \
"r" ((USItype)(v)))
#define UMUL_TIME 5
/* We might want to leave this undefined for `SuperSPARC (tm)' since
its implementation is crippled and often traps. */
#ifndef SUPERSPARC /* SuperSPARC's udiv only handles 53 bit dividends */
#define udiv_qrnnd(q, r, n1, n0, d) \
__asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\
: "=&r" ((USItype)(q)), \
"=&r" ((USItype)(r)) \
do { \
USItype __q; \
__asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
: "=r" ((USItype)(__q)) \
: "r" ((USItype)(n1)), \
"r" ((USItype)(n0)), \
"r" ((USItype)(d)))
"r" ((USItype)(d))); \
(r) = (n0) - __q * (d); \
(q) = __q; \
} while (0)
#define UDIV_TIME 25
#else
#endif /* SUPERSPARC */
#else /* ! __sparc_v8__ */
#if defined (__sparclite__)
/* This has hardware multiply but not divide. It also has two additional
instructions scan (ffs from high bit) and divscc. */
@ -973,9 +1012,10 @@ extern USItype __udiv_qrnnd ();
__asm__ ("scan %1,0,%0" \
: "=r" ((USItype)(x)) \
: "r" ((USItype)(count)))
#else
/* SPARC without integer multiplication and divide instructions.
(i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
__asm__ ("! Inlined umul_ppmm
wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr
@ -1023,6 +1063,9 @@ extern USItype __udiv_qrnnd ();
"r" ((USItype)(v)) \
: "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39 /* 39 instructions */
#endif
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
do { USItype __r; \
(q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \
@ -1030,8 +1073,8 @@ extern USItype __udiv_qrnnd ();
} while (0)
extern USItype __udiv_qrnnd ();
#define UDIV_TIME 140
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */
#if defined (__vax__) && W_TYPE_SIZE == 32
@ -1075,7 +1118,7 @@ extern USItype __udiv_qrnnd ();
__xx.__i.__h = n1; __xx.__i.__l = n0; \
__asm__ ("ediv %3,%2,%0,%1" \
: "=g" (q), "=g" (r) \
: "g" (__n1n0.ll), "g" (d)); \
: "g" (__xx.ll), "g" (d)); \
} while (0)
#endif /* __vax__ */
@ -1173,11 +1216,12 @@ extern USItype __udiv_qrnnd ();
do { \
UWtype __x0, __x1, __x2, __x3; \
UHWtype __ul, __vl, __uh, __vh; \
UWtype __u = (u), __v = (v); \
\
__ul = __ll_lowpart (u); \
__uh = __ll_highpart (u); \
__vl = __ll_lowpart (v); \
__vh = __ll_highpart (v); \
__ul = __ll_lowpart (__u); \
__uh = __ll_highpart (__u); \
__vl = __ll_lowpart (__v); \
__vh = __ll_highpart (__v); \
\
__x0 = (UWtype) __ul * __vl; \
__x1 = (UWtype) __ul * __vh; \
@ -1194,6 +1238,17 @@ extern USItype __udiv_qrnnd ();
} while (0)
#endif
/* If no signed widening multiply was supplied above, synthesize smul_ppmm
   from umul_ppmm: form the unsigned double-word product of (u) and (v),
   then correct the high word by subtracting each operand whenever the
   other operand has its sign bit set.  The low word is the same for the
   signed and unsigned product.
   The guard tests smul_ppmm (not umul_ppmm): the body needs umul_ppmm to
   exist, so guarding on its absence would make this definition unusable.  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __m0 = (u), __m1 = (v); \
    umul_ppmm (__w1, w0, __m0, __m1); \
    (w1) = __w1 - (-(__m0 >> (W_TYPE_SIZE - 1)) & __m1) \
	   - (-(__m1 >> (W_TYPE_SIZE - 1)) & __m0); \
  } while (0)
#endif
/* Define this unconditionally, so it can be used for debugging. */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
do { \

119
sysdeps/alpha/add_n.s Normal file
View File

@ -0,0 +1,119 @@
# Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
# store sum in a third limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr $16
# s1_ptr $17
# s2_ptr $18
# size $19
# __mpn_add_n: add the limb vectors at $17 and $18 ($19 limbs each) and
# store the sums at $16.  $0 carries the running carry (0 or 1) between
# limbs and is left holding the final carry on return.
# Structure: the first limb of each source is preloaded into $3/$4; size-1
# is split into (size-1) mod 4 limbs handled one at a time by .Loop0 and a
# multiple of 4 handled by the 4-way unrolled .Loop; .Lend adds the last
# preloaded pair.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_add_n
.ent __mpn_add_n
__mpn_add_n:
.frame $30,0,$26,0
# Preload limb 0 of s1 and s2.
ldq $3,0($17)
ldq $4,0($18)
# $19 = size - 1: the final limb is always handled at .Lend.
subq $19,1,$19
and $19,4-1,$2 # number of limbs in first loop
# Start with carry = 0.
bis $31,$31,$0
beq $2,.L0 # if multiple of 4 limbs, skip first loop
subq $19,$2,$19
# One limb per iteration: load the next pair into $5/$6 while adding the
# current pair ($3/$4) plus carry.  Carry out is the OR of the carry from
# (s2 + cy) and the carry from (s1 + that sum).
.Loop0: subq $2,1,$2
ldq $5,8($17)
addq $4,$0,$4
ldq $6,8($18)
cmpult $4,$0,$1
addq $3,$4,$4
cmpult $4,$3,$0
stq $4,0($16)
or $0,$1,$0
addq $17,8,$17
addq $18,8,$18
# Move the freshly loaded pair into the "current" registers.
bis $5,$5,$3
bis $6,$6,$4
addq $16,8,$16
bne $2,.Loop0
.L0: beq $19,.Lend
.align 3
# Four limbs per iteration, alternating between the register pairs
# $3/$4 and $5/$6 so each load is issued before the add that precedes it
# completes.
.Loop: subq $19,4,$19
ldq $5,8($17)
addq $4,$0,$4
ldq $6,8($18)
cmpult $4,$0,$1
addq $3,$4,$4
cmpult $4,$3,$0
stq $4,0($16)
or $0,$1,$0
ldq $3,16($17)
addq $6,$0,$6
ldq $4,16($18)
cmpult $6,$0,$1
addq $5,$6,$6
cmpult $6,$5,$0
stq $6,8($16)
or $0,$1,$0
ldq $5,24($17)
addq $4,$0,$4
ldq $6,24($18)
cmpult $4,$0,$1
addq $3,$4,$4
cmpult $4,$3,$0
stq $4,16($16)
or $0,$1,$0
ldq $3,32($17)
addq $6,$0,$6
ldq $4,32($18)
cmpult $6,$0,$1
addq $5,$6,$6
cmpult $6,$5,$0
stq $6,24($16)
or $0,$1,$0
addq $17,32,$17
addq $18,32,$18
addq $16,32,$16
bne $19,.Loop
# Final limb: the pair is already in $3/$4; leave the carry out in $0.
.Lend: addq $4,$0,$4
cmpult $4,$0,$1
addq $3,$4,$4
cmpult $4,$3,$0
stq $4,0($16)
or $0,$1,$0
ret $31,($26),1
.end __mpn_add_n

100
sysdeps/alpha/addmul_1.s Normal file
View File

@ -0,0 +1,100 @@
# Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
# the result to a second limb vector.
# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r16
# s1_ptr r17
# size r18
# s2_limb r19
# This code runs at 42 cycles/limb on the 21064.
# To improve performance for long multiplications, we would use
# 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
# these instructions without slowing down the general code: 1. We can
# only have two prefetches in operation at any time in the Alpha
# architecture. 2. There will seldom be any special alignment
# between RES_PTR and S1_PTR. Maybe we can simply divide the current
# loop into an inner and outer loop, having the inner loop handle
# exactly one prefetch block?
# __mpn_addmul_1: for each of the $18 limbs at $17, multiply by the limb in
# $19 and add the product into the limb vector at $16.  The carry limb out
# of the most significant position is returned in $0.
# The code is software-pipelined: the multiply for limb i+1 is started
# while the addition for limb i is still being completed, so sizes 1 and 2
# are peeled off to Lend1 / Lend2 before the steady-state Loop.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_addmul_1
.ent __mpn_addmul_1 2
__mpn_addmul_1:
.frame $30,0,$26
ldq $2,0($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
subq $18,1,$18 # size--
mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
umulh $2,$19,$0 # $0 = prod_high
beq $18,Lend1 # jump if size was == 1
ldq $2,0($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
subq $18,1,$18 # size--
# Finish limb 0: res[0] += prod_low, $4 = carry out of that addition.
addq $5,$3,$3
cmpult $3,$5,$4
stq $3,0($16)
addq $16,8,$16 # res_ptr++
beq $18,Lend2 # jump if size was == 2
.align 3
# Steady state: on entry $2 is the current s1 limb, $0 the previous
# prod_high and $4 the pending carry to fold into it.
Loop: mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
subq $18,1,$18 # size--
umulh $2,$19,$4 # $4 = cy_limb
ldq $2,0($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
addq $3,$0,$3 # $3 = cy_limb + prod_low
cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
addq $5,$3,$3
cmpult $3,$5,$5
stq $3,0($16)
addq $16,8,$16 # res_ptr++
addq $5,$0,$0 # combine carries
bne $18,Loop
# Last limb (no further load from s1): same computation as the loop body,
# then fold the final prod_high into the returned carry limb.
Lend2: mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
umulh $2,$19,$4 # $4 = cy_limb
addq $3,$0,$3 # $3 = cy_limb + prod_low
cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
addq $5,$3,$3
cmpult $3,$5,$5
stq $3,0($16)
addq $5,$0,$0 # combine carries
addq $4,$0,$0 # cy_limb = prod_high + cy
ret $31,($26),1
# size == 1: add the single product into res[0]; return prod_high plus the
# carry out of that addition.
Lend1: addq $5,$3,$3
cmpult $3,$5,$5
stq $3,0($16)
addq $0,$5,$0
ret $31,($26),1
.end __mpn_addmul_1

View File

@ -0,0 +1,118 @@
# Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
# store sum in a third limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr $16
# s1_ptr $17
# s2_ptr $18
# size $19
# __mpn_add_n (second variant in this change): add the limb vectors at $17
# and $18 ($19 limbs each), store the sums at $16, and leave the final
# carry (0 or 1) in $0.
# The prologue, .Loop0 and .Lend match the other __mpn_add_n in this
# change.  The unrolled .Loop differs: the four sums are accumulated in
# $20-$23 and stored together at the end of the iteration, and the carry is
# threaded through $0 itself rather than through the operand register.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_add_n
.ent __mpn_add_n
__mpn_add_n:
.frame $30,0,$26,0
# Preload limb 0 of s1 and s2.
ldq $3,0($17)
ldq $4,0($18)
# $19 = size - 1: the final limb is always handled at .Lend.
subq $19,1,$19
and $19,4-1,$2 # number of limbs in first loop
# Start with carry = 0.
bis $31,$31,$0
beq $2,.L0 # if multiple of 4 limbs, skip first loop
subq $19,$2,$19
# One limb per iteration; the next pair is loaded into $5/$6 and then
# copied to $3/$4 for the following iteration.
.Loop0: subq $2,1,$2
ldq $5,8($17)
addq $4,$0,$4
ldq $6,8($18)
cmpult $4,$0,$1
addq $3,$4,$4
cmpult $4,$3,$0
stq $4,0($16)
or $0,$1,$0
addq $17,8,$17
addq $18,8,$18
bis $5,$5,$3
bis $6,$6,$4
addq $16,8,$16
bne $2,.Loop0
.L0: beq $19,.Lend
.align 4
# Four limbs per iteration.  For each limb: $0 = s2 + cy, $1 = carry of
# that add, $2x = s1 + $0, $0 = carry of that add, then $0 |= $1.
.Loop: subq $19,4,$19
unop
ldq $6,8($18)
addq $4,$0,$0
ldq $5,8($17)
cmpult $0,$4,$1
ldq $4,16($18)
addq $3,$0,$20
cmpult $20,$3,$0
ldq $3,16($17)
or $0,$1,$0
addq $6,$0,$0
cmpult $0,$6,$1
ldq $6,24($18)
addq $5,$0,$21
cmpult $21,$5,$0
ldq $5,24($17)
or $0,$1,$0
addq $4,$0,$0
cmpult $0,$4,$1
ldq $4,32($18)
addq $3,$0,$22
cmpult $22,$3,$0
ldq $3,32($17)
or $0,$1,$0
addq $6,$0,$0
cmpult $0,$6,$1
addq $5,$0,$23
cmpult $23,$5,$0
or $0,$1,$0
# Store the four sums computed above.
stq $20,0($16)
stq $21,8($16)
stq $22,16($16)
stq $23,24($16)
addq $17,32,$17
addq $18,32,$18
addq $16,32,$16
bne $19,.Loop
# Final limb: the pair is already in $3/$4; leave the carry out in $0.
.Lend: addq $4,$0,$4
cmpult $4,$0,$1
addq $3,$4,$4
cmpult $4,$3,$0
stq $4,0($16)
or $0,$1,$0
ret $31,($26),1
.end __mpn_add_n

View File

@ -0,0 +1,175 @@
# Alpha EV5 __mpn_lshift --
# Copyright (C) 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r16
# s1_ptr r17
# size r18
# cnt r19
# This code runs at 4.25 cycles/limb on the EV5.
# __mpn_lshift: shift the $18-limb vector at $17 left by $19 bits and store
# the result at $16, working from the most significant limb downwards.
# The bits shifted out of the top limb are returned in $0.
#
# Register use:
#   $19      = cnt (left-shift count)
#   $20      = 0 - cnt, used as the count for the complementary right shifts
#   $28      = (size-1) mod 4, limbs handled one at a time in Loop0
#   $18      = remaining limb count, a multiple of 4 after Loop0
#   $1-$4    = source limbs in flight
#   $21-$24  = left-shifted parts, $5-$8 = right-shifted / combined parts
#
# The unrolled part is software-pipelined in two stages ("warm up" phases 1
# and 2); Lcool1 drains the pipeline when only phase 1 was entered, Lcool2
# drains it after phase 2 / the main loop.  This mirrors the label scheme of
# the companion __mpn_rshift.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_lshift
.ent __mpn_lshift
__mpn_lshift:
.frame $30,0,$26,0
s8addq $18,$17,$17 # make r17 point at end of s1
ldq $4,-8($17) # load first limb
subq $31,$19,$20 # r20 = -cnt, count for the right shifts
s8addq $18,$16,$16 # make r16 point at end of RES
subq $18,1,$18
and $18,4-1,$28 # number of limbs in first loop
srl $4,$20,$0 # compute function result
beq $28,L0
subq $18,$28,$18
.align 3
# One limb per iteration: combine the low part of the current limb ($4)
# with the high part of the next lower limb ($3).
Loop0: ldq $3,-16($17)
subq $16,8,$16
sll $4,$19,$5
subq $17,8,$17
subq $28,1,$28
srl $3,$20,$6
or $3,$3,$4
or $5,$6,$8
stq $8,0($16)
bne $28,Loop0
L0: sll $4,$19,$24
beq $18,Lend
# warm up phase 1
ldq $1,-16($17)
subq $18,4,$18
ldq $2,-24($17)
ldq $3,-32($17)
ldq $4,-40($17)
beq $18,Lcool1
# warm up phase 2
srl $1,$20,$7
sll $1,$19,$21
srl $2,$20,$8
ldq $1,-48($17)
sll $2,$19,$22
ldq $2,-56($17)
srl $3,$20,$5
or $7,$24,$7
sll $3,$19,$23
or $8,$21,$8
srl $4,$20,$6
ldq $3,-64($17)
sll $4,$19,$24
ldq $4,-72($17)
subq $18,4,$18
# Results of phase 2 are pending in $7/$8/$5/$6, so drain via Lcool2.
beq $18,Lcool2
.align 4
# main loop
Loop: stq $7,-8($16)
or $5,$22,$5
stq $8,-16($16)
or $6,$23,$6
srl $1,$20,$7
subq $18,4,$18
sll $1,$19,$21
unop # ldq $31,-96($17)
srl $2,$20,$8
ldq $1,-80($17)
sll $2,$19,$22
ldq $2,-88($17)
stq $5,-24($16)
or $7,$24,$7
stq $6,-32($16)
or $8,$21,$8
srl $3,$20,$5
unop # ldq $31,-96($17)
sll $3,$19,$23
subq $16,32,$16
srl $4,$20,$6
ldq $3,-96($17)
sll $4,$19,$24
ldq $4,-104($17)
subq $17,32,$17
bne $18,Loop
unop
unop
# cool down phase 2/1
Lcool2: stq $7,-8($16)
or $5,$22,$5
stq $8,-16($16)
or $6,$23,$6
srl $1,$20,$7
sll $1,$19,$21
srl $2,$20,$8
sll $2,$19,$22
stq $5,-24($16)
or $7,$24,$7
stq $6,-32($16)
or $8,$21,$8
srl $3,$20,$5
sll $3,$19,$23
srl $4,$20,$6
sll $4,$19,$24
# cool down phase 2/2
stq $7,-40($16)
or $5,$22,$5
stq $8,-48($16)
or $6,$23,$6
stq $5,-56($16)
stq $6,-64($16)
# cool down phase 2/3
stq $24,-72($16)
ret $31,($26),1
# cool down phase 1/1
Lcool1: srl $1,$20,$7
sll $1,$19,$21
srl $2,$20,$8
sll $2,$19,$22
srl $3,$20,$5
or $7,$24,$7
sll $3,$19,$23
or $8,$21,$8
srl $4,$20,$6
sll $4,$19,$24
# cool down phase 1/2
stq $7,-8($16)
or $5,$22,$5
stq $8,-16($16)
or $6,$23,$6
stq $5,-24($16)
stq $6,-32($16)
stq $24,-40($16)
ret $31,($26),1
# No unrolled work left: store the least significant result limb.
Lend: stq $24,-8($16)
ret $31,($26),1
.end __mpn_lshift

View File

@ -0,0 +1,173 @@
# Alpha EV5 __mpn_rshift --
# Copyright (C) 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r16
# s1_ptr r17
# size r18
# cnt r19
# This code runs at 4.25 cycles/limb on the EV5.
# __mpn_rshift: shift the $18-limb vector at $17 right by $19 bits and
# store the result at $16, working from the least significant limb upwards.
# The bits shifted out of the bottom limb are returned in $0.
#
# Register use:
#   $19      = cnt (right-shift count)
#   $20      = 0 - cnt, used as the count for the complementary left shifts
#   $28      = (size-1) mod 4, limbs handled one at a time in Loop0
#   $18      = remaining limb count, a multiple of 4 after Loop0
#   $1-$4    = source limbs in flight
#   $21-$24  = right-shifted parts, $5-$8 = left-shifted / combined parts
#
# The unrolled part is software-pipelined in two stages ("warm up" phases 1
# and 2); Lcool1 drains the pipeline when only phase 1 was entered, Lcool2
# drains it after phase 2 / the main loop.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_rshift
.ent __mpn_rshift
__mpn_rshift:
.frame $30,0,$26,0
ldq $4,0($17) # load first limb
subq $31,$19,$20
subq $18,1,$18
and $18,4-1,$28 # number of limbs in first loop
sll $4,$20,$0 # compute function result
beq $28,L0
subq $18,$28,$18
.align 3
# One limb per iteration: combine the high part of the current limb ($4)
# with the low part of the next higher limb ($3).
Loop0: ldq $3,8($17)
addq $16,8,$16
srl $4,$19,$5
addq $17,8,$17
subq $28,1,$28
sll $3,$20,$6
or $3,$3,$4
or $5,$6,$8
stq $8,-8($16)
bne $28,Loop0
L0: srl $4,$19,$24
beq $18,Lend
# warm up phase 1
ldq $1,8($17)
subq $18,4,$18
ldq $2,16($17)
ldq $3,24($17)
ldq $4,32($17)
beq $18,Lcool1
# warm up phase 2
sll $1,$20,$7
srl $1,$19,$21
sll $2,$20,$8
ldq $1,40($17)
srl $2,$19,$22
ldq $2,48($17)
sll $3,$20,$5
or $7,$24,$7
srl $3,$19,$23
or $8,$21,$8
sll $4,$20,$6
ldq $3,56($17)
srl $4,$19,$24
ldq $4,64($17)
subq $18,4,$18
beq $18,Lcool2
.align 4
# main loop
Loop: stq $7,0($16)
or $5,$22,$5
stq $8,8($16)
or $6,$23,$6
sll $1,$20,$7
subq $18,4,$18
srl $1,$19,$21
unop # ldq $31,-96($17)
sll $2,$20,$8
ldq $1,72($17)
srl $2,$19,$22
ldq $2,80($17)
stq $5,16($16)
or $7,$24,$7
stq $6,24($16)
or $8,$21,$8
sll $3,$20,$5
unop # ldq $31,-96($17)
srl $3,$19,$23
addq $16,32,$16
sll $4,$20,$6
ldq $3,88($17)
srl $4,$19,$24
ldq $4,96($17)
addq $17,32,$17
bne $18,Loop
unop
unop
# cool down phase 2/1
Lcool2: stq $7,0($16)
or $5,$22,$5
stq $8,8($16)
or $6,$23,$6
sll $1,$20,$7
srl $1,$19,$21
sll $2,$20,$8
srl $2,$19,$22
stq $5,16($16)
or $7,$24,$7
stq $6,24($16)
or $8,$21,$8
sll $3,$20,$5
srl $3,$19,$23
sll $4,$20,$6
srl $4,$19,$24
# cool down phase 2/2
stq $7,32($16)
or $5,$22,$5
stq $8,40($16)
or $6,$23,$6
stq $5,48($16)
stq $6,56($16)
# cool down phase 2/3
stq $24,64($16)
ret $31,($26),1
# cool down phase 1/1
Lcool1: sll $1,$20,$7
srl $1,$19,$21
sll $2,$20,$8
srl $2,$19,$22
sll $3,$20,$5
or $7,$24,$7
srl $3,$19,$23
or $8,$21,$8
sll $4,$20,$6
srl $4,$19,$24
# cool down phase 1/2
stq $7,0($16)
or $5,$22,$5
stq $8,8($16)
or $6,$23,$6
stq $5,16($16)
stq $6,24($16)
stq $24,32($16)
ret $31,($26),1
# No unrolled work left: store the most significant result limb.
Lend: stq $24,0($16)
ret $31,($26),1
.end __mpn_rshift

108
sysdeps/alpha/lshift.s Normal file
View File

@ -0,0 +1,108 @@
# Alpha 21064 __mpn_lshift --
# Copyright (C) 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r16
# s1_ptr r17
# size r18
# cnt r19
# This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
# it would take 4 cycles/limb. It should be possible to get down to 3
# cycles/limb since both ldq and stq can be paired with the other used
# instructions. But there are many restrictions in the 21064 pipeline that
# make it hard, if not impossible, to get down to 3 cycles/limb:
# 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
# 2. Only aligned instruction pairs can be paired.
# 3. The store buffer or silo might not be able to deal with the bandwidth.
# __mpn_lshift: shift the $18-limb vector at $17 left by $19 bits and store
# the result at $16, working from the most significant limb downwards.
# The bits shifted out of the top limb are returned in $0.
# $7 holds 0 - cnt and is used as the count for the complementary right
# shifts.  (size-1) mod 4 limbs are handled one at a time in Loop0, the
# rest four at a time in Loop, and Lend stores the least significant limb.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_lshift
.ent __mpn_lshift
__mpn_lshift:
.frame $30,0,$26,0
s8addq $18,$17,$17 # make r17 point at end of s1
ldq $4,-8($17) # load first limb
subq $17,8,$17
subq $31,$19,$7
s8addq $18,$16,$16 # make r16 point at end of RES
subq $18,1,$18
and $18,4-1,$20 # number of limbs in first loop
srl $4,$7,$0 # compute function result
beq $20,L0
subq $18,$20,$18
.align 3
# One limb per iteration: result = (current << cnt) | (next lower >> -cnt).
Loop0:
ldq $3,-8($17)
subq $16,8,$16
subq $17,8,$17
subq $20,1,$20
sll $4,$19,$5
srl $3,$7,$6
bis $3,$3,$4
bis $5,$6,$8
stq $8,0($16)
bne $20,Loop0
L0: beq $18,Lend
.align 3
# Four limbs per iteration, alternating $3 and $4 as the "current" limb so
# no register copy is needed between steps.
Loop: ldq $3,-8($17)
subq $16,32,$16
subq $18,4,$18
sll $4,$19,$5
srl $3,$7,$6
ldq $4,-16($17)
sll $3,$19,$1
bis $5,$6,$8
stq $8,24($16)
srl $4,$7,$2
ldq $3,-24($17)
sll $4,$19,$5
bis $1,$2,$8
stq $8,16($16)
srl $3,$7,$6
ldq $4,-32($17)
sll $3,$19,$1
bis $5,$6,$8
stq $8,8($16)
srl $4,$7,$2
subq $17,32,$17
bis $1,$2,$8
stq $8,0($16)
bgt $18,Loop
# Least significant result limb: only the left-shifted part of the last
# source limb contributes.
Lend: sll $4,$19,$8
stq $8,-8($16)
ret $31,($26),1
.end __mpn_lshift

84
sysdeps/alpha/mul_1.s Normal file
View File

@ -0,0 +1,84 @@
# Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
# the result in a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r16
# s1_ptr r17
# size r18
# s2_limb r19
# This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
# To improve performance for long multiplications, we would use
# 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
# these instructions without slowing down the general code: 1. We can
# only have two prefetches in operation at any time in the Alpha
# architecture. 2. There will seldom be any special alignment
# between RES_PTR and S1_PTR. Maybe we can simply divide the current
# loop into an inner and outer loop, having the inner loop handle
# exactly one prefetch block?
# __mpn_mul_1: multiply each of the $18 limbs at $17 by the limb in $19 and
# store the low words, with carry propagation, at $16.  The carry limb out
# of the most significant position is returned in $0.
# The code is software-pipelined (the next s1 limb is loaded while the
# current product is being folded in), so sizes 1 and 2 are peeled off to
# Lend1 / Lend2 before the steady-state Loop.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_mul_1
.ent __mpn_mul_1 2
__mpn_mul_1:
.frame $30,0,$26
ldq $2,0($17) # $2 = s1_limb
subq $18,1,$18 # size--
mulq $2,$19,$3 # $3 = prod_low
bic $31,$31,$4 # clear cy_limb
umulh $2,$19,$0 # $0 = prod_high
beq $18,Lend1 # jump if size was == 1
ldq $2,8($17) # $2 = s1_limb
subq $18,1,$18 # size--
stq $3,0($16)
beq $18,Lend2 # jump if size was == 2
.align 3
# Steady state: $2 is the current s1 limb, $0 the previous prod_high and
# $4 the pending carry to fold into it.  The s1 and res pointers lag one
# limb behind, hence the 16($17) load and 8($16) store.
Loop: mulq $2,$19,$3 # $3 = prod_low
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
subq $18,1,$18 # size--
umulh $2,$19,$4 # $4 = cy_limb
ldq $2,16($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
addq $3,$0,$3 # $3 = cy_limb + prod_low
stq $3,8($16)
cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
addq $16,8,$16 # res_ptr++
bne $18,Loop
# Last limb (no further load from s1), then fold the final prod_high into
# the returned carry limb.
Lend2: mulq $2,$19,$3 # $3 = prod_low
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
umulh $2,$19,$4 # $4 = cy_limb
addq $3,$0,$3 # $3 = cy_limb + prod_low
cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
stq $3,8($16)
addq $4,$0,$0 # cy_limb = prod_high + cy
ret $31,($26),1
# size == 1: store prod_low; prod_high is already in $0 as the result.
Lend1: stq $3,0($16)
ret $31,($26),1
.end __mpn_mul_1

106
sysdeps/alpha/rshift.s Normal file
View File

@ -0,0 +1,106 @@
# Alpha 21064 __mpn_rshift --
# Copyright (C) 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r16
# s1_ptr r17
# size r18
# cnt r19
# This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
# it would take 4 cycles/limb. It should be possible to get down to 3
# cycles/limb since both ldq and stq can be paired with the other used
# instructions. But there are many restrictions in the 21064 pipeline that
# make it hard, if not impossible, to get down to 3 cycles/limb:
# 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
# 2. Only aligned instruction pairs can be paired.
# 3. The store buffer or silo might not be able to deal with the bandwidth.
# __mpn_rshift: shift the $18-limb vector at $17 right by $19 bits and
# store the result at $16, working from the least significant limb upwards.
# The bits shifted out of the bottom limb are returned in $0.
# $7 holds 0 - cnt and is used as the count for the complementary left
# shifts.  (size-1) mod 4 limbs are handled one at a time in Loop0, the
# rest four at a time in Loop, and Lend stores the most significant limb.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_rshift
.ent __mpn_rshift
__mpn_rshift:
.frame $30,0,$26,0
ldq $4,0($17) # load first limb
addq $17,8,$17
subq $31,$19,$7
subq $18,1,$18
and $18,4-1,$20 # number of limbs in first loop
sll $4,$7,$0 # compute function result
beq $20,L0
subq $18,$20,$18
.align 3
# One limb per iteration: result = (current >> cnt) | (next higher << -cnt).
Loop0:
ldq $3,0($17)
addq $16,8,$16
addq $17,8,$17
subq $20,1,$20
srl $4,$19,$5
sll $3,$7,$6
bis $3,$3,$4
bis $5,$6,$8
stq $8,-8($16)
bne $20,Loop0
L0: beq $18,Lend
.align 3
# Four limbs per iteration, alternating $3 and $4 as the "current" limb so
# no register copy is needed between steps.
Loop: ldq $3,0($17)
addq $16,32,$16
subq $18,4,$18
srl $4,$19,$5
sll $3,$7,$6
ldq $4,8($17)
srl $3,$19,$1
bis $5,$6,$8
stq $8,-32($16)
sll $4,$7,$2
ldq $3,16($17)
srl $4,$19,$5
bis $1,$2,$8
stq $8,-24($16)
sll $3,$7,$6
ldq $4,24($17)
srl $3,$19,$1
bis $5,$6,$8
stq $8,-16($16)
sll $4,$7,$2
addq $17,32,$17
bis $1,$2,$8
stq $8,-8($16)
bgt $18,Loop
# Most significant result limb: only the right-shifted part of the last
# source limb contributes.
Lend: srl $4,$19,$8
stq $8,0($16)
ret $31,($26),1
.end __mpn_rshift

119
sysdeps/alpha/sub_n.s Normal file
View File

@ -0,0 +1,119 @@
# Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
# store difference in a third limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr $16
# s1_ptr $17
# s2_ptr $18
# size $19
# The instruction order below is hand-scheduled; noreorder keeps the
# assembler from rearranging it.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_sub_n
.ent __mpn_sub_n
# __mpn_sub_n -- compute s1 - s2 limb by limb over SIZE ($19) limbs, storing
# the difference at res_ptr ($16).  The borrow out of the last limb is left
# in $0 (0 or 1) on return.
#
# Register use: $3/$5 hold s1 limbs, $4/$6 hold s2 limbs, $0 is the running
# borrow and $1 is a scratch carry flag.  The limbs for step i+1 are loaded
# while step i is being computed.
__mpn_sub_n:
.frame $30,0,$26,0
ldq $3,0($17)
ldq $4,0($18)
# $19 = size - 1: the final limb is handled separately at .Lend.
subq $19,1,$19
and $19,4-1,$2 # number of limbs in first loop
# Clear the incoming borrow.
bis $31,$31,$0
beq $2,.L0 # if multiple of 4 limbs, skip first loop
subq $19,$2,$19
# First loop: one limb per iteration.
.Loop0: subq $2,1,$2
ldq $5,8($17)
# Fold the incoming borrow into the s2 limb; $1 records whether that
# addition wrapped around.
addq $4,$0,$4
ldq $6,8($18)
cmpult $4,$0,$1
subq $3,$4,$4
# $0 = 1 when the s1 limb was smaller than what was subtracted from it,
# i.e. the subtraction borrowed.
cmpult $3,$4,$0
stq $4,0($16)
# At most one of the two conditions can be set; merge them into the borrow.
or $0,$1,$0
addq $17,8,$17
addq $18,8,$18
# Move the preloaded limbs into the registers the next step reads.
bis $5,$5,$3
bis $6,$6,$4
addq $16,8,$16
bne $2,.Loop0
.L0: beq $19,.Lend
.align 3
# Main loop: four limbs per iteration, alternating between the register
# pairs ($3,$4) and ($5,$6) so no copies are needed.  Each step is the same
# add-borrow / compare / subtract / compare / store / merge sequence as above.
.Loop: subq $19,4,$19
ldq $5,8($17)
addq $4,$0,$4
ldq $6,8($18)
cmpult $4,$0,$1
subq $3,$4,$4
cmpult $3,$4,$0
stq $4,0($16)
or $0,$1,$0
ldq $3,16($17)
addq $6,$0,$6
ldq $4,16($18)
cmpult $6,$0,$1
subq $5,$6,$6
cmpult $5,$6,$0
stq $6,8($16)
or $0,$1,$0
ldq $5,24($17)
addq $4,$0,$4
ldq $6,24($18)
cmpult $4,$0,$1
subq $3,$4,$4
cmpult $3,$4,$0
stq $4,16($16)
or $0,$1,$0
ldq $3,32($17)
addq $6,$0,$6
ldq $4,32($18)
cmpult $6,$0,$1
subq $5,$6,$6
cmpult $5,$6,$0
stq $6,24($16)
or $0,$1,$0
addq $17,32,$17
addq $18,32,$18
addq $16,32,$16
bne $19,.Loop
# Final limb (already in $3/$4): same step without preloading; the merged
# borrow in $0 is the return value.
.Lend: addq $4,$0,$4
cmpult $4,$0,$1
subq $3,$4,$4
cmpult $3,$4,$0
stq $4,0($16)
or $0,$1,$0
ret $31,($26),1
.end __mpn_sub_n

100
sysdeps/alpha/submul_1.s Normal file
View File

@ -0,0 +1,100 @@
# Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and
# subtract the result from a second limb vector.
# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r16
# s1_ptr r17
# size r18
# s2_limb r19
# This code runs at 42 cycles/limb on the 21064.
# To improve performance for long multiplications, we would use
# 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
# these instructions without slowing down the general code: 1. We can
# only have two prefetches in operation at any time in the Alpha
# architecture. 2. There will seldom be any special alignment
# between RES_PTR and S1_PTR. Maybe we can simply divide the current
# loop into an inner and outer loop, having the inner loop handle
# exactly one prefetch block?
# The instruction order below is hand-scheduled; noreorder keeps the
# assembler from rearranging it.
.set noreorder
.set noat
.text
.align 3
.globl __mpn_submul_1
.ent __mpn_submul_1 2
# __mpn_submul_1 -- multiply the SIZE ($18) limbs at s1_ptr ($17) by s2_limb
# ($19) and subtract the products from the limbs at res_ptr ($16) in place.
# The final carry limb is left in $0 on return.
#
# The code is software pipelined: the first limb's product is started before
# the loop, Loop handles the middle limbs, Lend2 finishes the last limb, and
# Lend1 is the separate exit for size == 1.
__mpn_submul_1:
.frame $30,0,$26
ldq $2,0($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
subq $18,1,$18 # size--
mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
umulh $2,$19,$0 # $0 = prod_high
beq $18,Lend1 # jump if size was == 1
ldq $2,0($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
subq $18,1,$18 # size--
# Subtract the first product's low half; $4 = 1 if that subtraction borrowed.
subq $5,$3,$3
cmpult $5,$3,$4
stq $3,0($16)
addq $16,8,$16 # res_ptr++
beq $18,Lend2 # jump if size was == 2
.align 3
# On entry to each iteration: $2 = current s1 limb, $0 and $4 together hold
# the carry to propagate into this limb (summed by the first addq).
Loop: mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
subq $18,1,$18 # size--
umulh $2,$19,$4 # $4 = cy_limb
ldq $2,0($17) # $2 = s1_limb
addq $17,8,$17 # s1_ptr++
addq $3,$0,$3 # $3 = cy_limb + prod_low
cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
subq $5,$3,$3
# $5 = 1 if the subtraction from *res_ptr borrowed.
cmpult $5,$3,$5
stq $3,0($16)
addq $16,8,$16 # res_ptr++
addq $5,$0,$0 # combine carries
bne $18,Loop
# Last limb: same step as the loop body, without loading another s1 limb.
Lend2: mulq $2,$19,$3 # $3 = prod_low
ldq $5,0($16) # $5 = *res_ptr
addq $4,$0,$0 # cy_limb = cy_limb + 'cy'
umulh $2,$19,$4 # $4 = cy_limb
addq $3,$0,$3 # $3 = cy_limb + prod_low
cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
subq $5,$3,$3
cmpult $5,$3,$5
stq $3,0($16)
addq $5,$0,$0 # combine carries
addq $4,$0,$0 # cy_limb = prod_high + cy
ret $31,($26),1
# size == 1: $3/$0 hold the low/high product halves and $5 the res limb.
Lend1: subq $5,$3,$3
cmpult $5,$3,$5
stq $3,0($16)
# Return prod_high plus the borrow from the subtraction.
addq $0,$5,$0
ret $31,($26),1
.end __mpn_submul_1

View File

@ -134,7 +134,7 @@ Loop2: cmplt n0,0,tmp
ret $31,($26),1
Odd:
/* q' in n0. r' in n1. */
/* q' in n0. r' in n1 */
addq n1,n0,n1
cmpult n1,n0,tmp # tmp := carry from addq
beq tmp,LLp6

View File

@ -83,12 +83,10 @@ __mpn_divmod_1 (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
most significant bit (with weight 2**N) implicit. */
#if 0 /* This can't happen when normalization_steps != 0 */
/* Special case for DIVISOR_LIMB == 100...000. */
if (divisor_limb << 1 == 0)
divisor_limb_inverted = ~(mp_limb) 0;
else
#endif
udiv_qrnnd (divisor_limb_inverted, dummy,
-divisor_limb, 0, divisor_limb);

View File

@ -3,8 +3,6 @@
Return the single-limb remainder.
There are no constraints on the value of the divisor.
QUOT_PTR and DIVIDEND_PTR might point to the same limb.
Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc.
This file is part of the GNU MP Library.

57
sysdeps/hppa/add_n.s Normal file
View File

@ -0,0 +1,57 @@
; HP-PA __mpn_add_n -- Add two limb vectors of the same length > 0 and store
; sum in a third limb vector.
; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr gr26
; s1_ptr gr25
; s2_ptr gr24
; size gr23
; One might want to unroll this as for other processors, but it turns
; out that the data cache contention after a store makes such
; unrolling useless. We can't come under 5 cycles/limb anyway.
.code
.export __mpn_add_n
; __mpn_add_n -- add the SIZE (gr23) limbs at s1_ptr (gr25) and s2_ptr (gr24),
; storing the sums at res_ptr (gr26).  The carry out of the last limb is
; left in %r28 (0 or 1) on return.
;
; The instruction after each branch sits in the branch delay slot and is
; executed whether or not the branch is taken.  The carry produced by one
; add/addc has to survive the loads, store and addib that precede the next
; addc.
__mpn_add_n
.proc
.callinfo frame=0,no_calls
.entry
; Load the first limb of each operand, post-incrementing the pointers.
ldws,ma 4(0,%r25),%r20
ldws,ma 4(0,%r24),%r19
addib,= -1,%r23,L$end ; check for (SIZE == 1)
add %r20,%r19,%r28 ; add first limbs ignoring cy
; Each iteration loads the next limb pair, stores the previous sum, and adds
; the new pair with carry in the delay slot of the loop branch.
L$loop ldws,ma 4(0,%r25),%r20
ldws,ma 4(0,%r24),%r19
stws,ma %r28,4(0,%r26)
addib,<> -1,%r23,L$loop
addc %r20,%r19,%r28
; Store the last sum, then return with the carry bit materialised in %r28.
L$end stws %r28,0(0,%r26)
bv 0(%r2)
addc %r0,%r0,%r28
.exit
.procend

View File

@ -0,0 +1,101 @@
; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and
; add the result to a second limb vector.
; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr r26
; s1_ptr r25
; size r24
; s2_limb r23
; This runs at 11 cycles/limb on a PA7000. With the used instructions, it
; can not become faster due to data cache contention after a store. On the
; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
; since only the xmpyu does not need the integer pipeline, so the only
; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb
; on the PA7100.
; There are some ideas described in mul_1.s that apply to this code too.
.code
.export __mpn_addmul_1
; __mpn_addmul_1 -- multiply the SIZE (r24) limbs at s1_ptr (r25) by s2_limb
; (r23) and add the products to the limbs at res_ptr (r26) in place.  The
; final carry limb is left in %r28 on return.
;
; The multiply is done with xmpyu on floating-point registers, so operands
; and products travel between the integer and FP register files through a
; scratch area at -16(%r30): after fstds, the word at -16 is the high half
; of the product and the word at -12 is the low half.
; The instruction after each branch sits in the branch delay slot.
__mpn_addmul_1
.proc
.callinfo frame=64,no_calls
.entry
; Allocate the 64-byte frame that holds the scratch area.
ldo 64(%r30),%r30
fldws,ma 4(%r25),%fr5
stw %r23,-16(%r30) ; move s2_limb ...
addib,= -1,%r24,L$just_one_limb
fldws -16(%r30),%fr4 ; ... into fr4
add %r0,%r0,%r0 ; clear carry
; Prologue of the software pipeline: start the first two products.
xmpyu %fr4,%fr5,%fr6
fldws,ma 4(%r25),%fr7
fstds %fr6,-16(%r30)
xmpyu %fr4,%fr7,%fr8
ldw -12(%r30),%r19 ; least significant limb in product
ldw -16(%r30),%r28
fstds %fr8,-16(%r30)
addib,= -1,%r24,L$end
ldw -12(%r30),%r1
; Main loop
; On entry: %r19 = value to add to *res_ptr, %r28 = high half of the previous
; product, %r1 = low half of the product stored most recently.
L$loop ldws 0(%r26),%r29
fldws,ma 4(%r25),%fr5
add %r29,%r19,%r19
stws,ma %r19,4(%r26)
addc %r28,%r1,%r19
xmpyu %fr4,%fr5,%fr6
ldw -16(%r30),%r28
fstds %fr6,-16(%r30)
addc %r0,%r28,%r28
addib,<> -1,%r24,L$loop
ldw -12(%r30),%r1
; Epilogue: drain the two products still in flight.
L$end ldw 0(%r26),%r29
add %r29,%r19,%r19
stws,ma %r19,4(%r26)
addc %r28,%r1,%r19
ldw -16(%r30),%r28
ldws 0(%r26),%r29
addc %r0,%r28,%r28
add %r29,%r19,%r19
stws,ma %r19,4(%r26)
addc %r0,%r28,%r28
; Return; the frame is released in the delay slot.
bv 0(%r2)
ldo -64(%r30),%r30
; SIZE == 1: a single product, no pipelining needed.
L$just_one_limb
xmpyu %fr4,%fr5,%fr6
ldw 0(%r26),%r29
fstds %fr6,-16(%r30)
ldw -12(%r30),%r1
ldw -16(%r30),%r28
add %r29,%r1,%r19
stw %r19,0(%r26)
; Return value = high half of the product plus the carry from the add.
addc %r0,%r28,%r28
bv 0(%r2)
ldo -64(%r30),%r30
.exit
.procend

View File

@ -0,0 +1,97 @@
; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store
; the result in a second limb vector.
; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr r26
; s1_ptr r25
; size r24
; s2_limb r23
; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can
; not become faster due to data cache contention after a store. On the
; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
; only the xmpyu does not need the integer pipeline, so the only dual-issue
; we will get are addc+xmpyu. Unrolling would not help either CPU.
; We could use fldds to read two limbs at a time from the S1 array, and that
; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
; PA7100, respectively. We don't do that since it does not seem worth the
; (alignment) troubles...
; At least the PA7100 is rumored to be able to deal with cache-misses
; without stalling instruction issue. If this is true, and the cache is
; actually also lockup-free, we should use a deeper software pipeline, and
; load from S1 very early! (The loads and stores to -12(sp) will surely be
; in the cache.)
.code
.export __mpn_mul_1
; __mpn_mul_1 -- multiply the SIZE (r24) limbs at s1_ptr (r25) by s2_limb
; (r23) and store the products at res_ptr (r26).  The final carry limb is
; left in %r28 on return.
;
; Operands and products travel between the integer and FP register files
; through a scratch area at -16(%r30): after fstds, the word at -16 is the
; high half of the product and the word at -12 is the low half.
; The instruction after each branch sits in the branch delay slot.
__mpn_mul_1
.proc
.callinfo frame=64,no_calls
.entry
; Allocate the 64-byte frame that holds the scratch area.
ldo 64(%r30),%r30
fldws,ma 4(%r25),%fr5
stw %r23,-16(%r30) ; move s2_limb ...
addib,= -1,%r24,L$just_one_limb
fldws -16(%r30),%fr4 ; ... into fr4
add %r0,%r0,%r0 ; clear carry
; Prologue of the software pipeline: start the first two products.
xmpyu %fr4,%fr5,%fr6
fldws,ma 4(%r25),%fr7
fstds %fr6,-16(%r30)
xmpyu %fr4,%fr7,%fr8
ldw -12(%r30),%r19 ; least significant limb in product
ldw -16(%r30),%r28
fstds %fr8,-16(%r30)
addib,= -1,%r24,L$end
ldw -12(%r30),%r1
; Main loop
; On entry: %r19 = limb ready to be stored, %r28 = high half of the previous
; product, %r1 = low half of the product stored most recently.
L$loop fldws,ma 4(%r25),%fr5
stws,ma %r19,4(%r26)
addc %r28,%r1,%r19
xmpyu %fr4,%fr5,%fr6
ldw -16(%r30),%r28
fstds %fr6,-16(%r30)
addib,<> -1,%r24,L$loop
ldw -12(%r30),%r1
; Epilogue: drain the two products still in flight.
L$end stws,ma %r19,4(%r26)
addc %r28,%r1,%r19
ldw -16(%r30),%r28
stws,ma %r19,4(%r26)
addc %r0,%r28,%r28
; Return; the frame is released in the delay slot.
bv 0(%r2)
ldo -64(%r30),%r30
; SIZE == 1: the high half goes through the scratch area into %r28, and the
; low half is stored to *res_ptr straight from the FP register in the delay
; slot of the return.
L$just_one_limb
xmpyu %fr4,%fr5,%fr6
fstds %fr6,-16(%r30)
ldw -16(%r30),%r28
ldo -64(%r30),%r30
bv 0(%r2)
fstws %fr6R,0(%r26)
.exit
.procend

View File

@ -0,0 +1,110 @@
; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and
; subtract the result from a second limb vector.
; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr r26
; s1_ptr r25
; size r24
; s2_limb r23
; This runs at 12 cycles/limb on a PA7000. With the used instructions, it
; can not become faster due to data cache contention after a store. On the
; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
; since only the xmpyu does not need the integer pipeline, so the only
; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb
; on the PA7100.
; There are some ideas described in mul_1.s that apply to this code too.
; It seems possible to make this run as fast as __mpn_addmul_1, if we use
; sub,>>= %r29,%r19,%r22
; addi 1,%r28,%r28
; but that requires reworking the hairy software pipeline...
.code
.export __mpn_submul_1
; __mpn_submul_1 -- multiply the SIZE (r24) limbs at s1_ptr (r25) by s2_limb
; (r23) and subtract the products from the limbs at res_ptr (r26) in place.
; The final carry limb is left in %r28 on return.
;
; Operands and products travel between the integer and FP register files
; through a scratch area at -16(%r30): after fstds, the word at -16 is the
; high half of the product and the word at -12 is the low half.
; Each "sub" is followed by an "add" of the difference and the subtrahend
; whose result is discarded into %r0: it recreates the minuend, so its
; carry-out is set exactly when the subtraction wrapped.  That lets the
; borrow be propagated with addc like a carry.
; The instruction after each branch sits in the branch delay slot.
__mpn_submul_1
.proc
.callinfo frame=64,no_calls
.entry
; Allocate the 64-byte frame that holds the scratch area.
ldo 64(%r30),%r30
fldws,ma 4(%r25),%fr5
stw %r23,-16(%r30) ; move s2_limb ...
addib,= -1,%r24,L$just_one_limb
fldws -16(%r30),%fr4 ; ... into fr4
add %r0,%r0,%r0 ; clear carry
; Prologue of the software pipeline: start the first two products.
xmpyu %fr4,%fr5,%fr6
fldws,ma 4(%r25),%fr7
fstds %fr6,-16(%r30)
xmpyu %fr4,%fr7,%fr8
ldw -12(%r30),%r19 ; least significant limb in product
ldw -16(%r30),%r28
fstds %fr8,-16(%r30)
addib,= -1,%r24,L$end
ldw -12(%r30),%r1
; Main loop
; On entry: %r19 = value to subtract from *res_ptr, %r28 = high half of the
; previous product, %r1 = low half of the product stored most recently.
L$loop ldws 0(%r26),%r29
fldws,ma 4(%r25),%fr5
sub %r29,%r19,%r22
add %r22,%r19,%r0
stws,ma %r22,4(%r26)
addc %r28,%r1,%r19
xmpyu %fr4,%fr5,%fr6
ldw -16(%r30),%r28
fstds %fr6,-16(%r30)
addc %r0,%r28,%r28
addib,<> -1,%r24,L$loop
ldw -12(%r30),%r1
; Epilogue: drain the two products still in flight.
L$end ldw 0(%r26),%r29
sub %r29,%r19,%r22
add %r22,%r19,%r0
stws,ma %r22,4(%r26)
addc %r28,%r1,%r19
ldw -16(%r30),%r28
ldws 0(%r26),%r29
addc %r0,%r28,%r28
sub %r29,%r19,%r22
add %r22,%r19,%r0
stws,ma %r22,4(%r26)
addc %r0,%r28,%r28
; Return; the frame is released in the delay slot.
bv 0(%r2)
ldo -64(%r30),%r30
; SIZE == 1: a single product, no pipelining needed.
L$just_one_limb
xmpyu %fr4,%fr5,%fr6
ldw 0(%r26),%r29
fstds %fr6,-16(%r30)
ldw -12(%r30),%r1
ldw -16(%r30),%r28
sub %r29,%r1,%r22
add %r22,%r1,%r0
stw %r22,0(%r26)
; Return value = high half of the product plus the borrow.
addc %r0,%r28,%r28
bv 0(%r2)
ldo -64(%r30),%r30
.exit
.procend

View File

@ -0,0 +1,74 @@
; HP-PA __udiv_qrnnd division support, used from longlong.h.
; This version runs fast on PA 7000 and later.
; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; rem_ptr gr26
; n1 gr25
; n0 gr24
; d gr23
.code
; Double-precision constant with high word 0x43f00000 and low word 0, i.e.
; 2**64.  It is added to the converted dividend when the conversion treated
; a dividend with its top bit set as negative.
L$0000 .word 0x43f00000
.word 0x0
.export __udiv_qrnnd
; __udiv_qrnnd -- divide the two-word value (n1,n0) = (gr25,gr24) by d (gr23)
; using floating-point division.  The quotient is left in %r28 and the
; remainder is stored through rem_ptr (gr26).
;
; Outline: convert the 64-bit dividend and the divisor to double, divide,
; convert the quotient back to an integer, then recompute the remainder as
; n - q*d with xmpyu and adjust q down by one if that subtraction shows the
; estimate was too large.
; The instruction after each branch sits in the branch delay slot.
__udiv_qrnnd
.proc
.callinfo frame=64,no_calls
.entry
; Allocate a 64-byte frame; -16(%r30) and -12(%r30) are used as scratch
; words for moving values between the integer and FP register files.
ldo 64(%r30),%r30
stws %r25,-16(0,%r30) ; n_hi
stws %r24,-12(0,%r30) ; n_lo
; %r19 = address of the 2**64 constant.
ldil L'L$0000,%r19
ldo R'L$0000(%r19),%r19
fldds -16(0,%r30),%fr5
; The dividend has been loaded; reuse the low scratch word for the divisor.
stws %r23,-12(0,%r30)
; Skip the 2**64 correction when n1 is non-negative as a signed value.  The
; conversion in the delay slot is done in either case.
comib,<= 0,%r25,L$1
fcnvxf,dbl,dbl %fr5,%fr5
fldds 0(0,%r19),%fr4
fadd,dbl %fr4,%fr5,%fr5
L$1
; Build the 64-bit integer (0,d) in %fr6 and convert it to double in %fr4.
; NOTE(review): this assumes %fr0 reads as zero when used as a source here.
fcpy,sgl %fr0,%fr6L
fldws -12(0,%r30),%fr6R
fcnvxf,dbl,dbl %fr6,%fr4
fdiv,dbl %fr5,%fr4,%fr5
fcnvfx,dbl,dbl %fr5,%fr4
; Move the low word of the integer quotient into %r28.
fstws %fr4R,-16(%r30)
; %fr6 = q * d (64-bit product).
xmpyu %fr4R,%fr6R,%fr6
ldws -16(%r30),%r28
fstds %fr6,-16(0,%r30)
ldws -12(0,%r30),%r21
ldws -16(0,%r30),%r20
; (r19,r22) = (n1,n0) - q*d; r22 is the candidate remainder.
sub %r24,%r21,%r22
subb %r25,%r20,%r19
; If the high word of the difference is zero the estimate was exact.  The
; frame is released in the delay slot on both paths.
comib,= 0,%r19,L$2
ldo -64(%r30),%r30
; Otherwise correct by one: remainder += d, quotient -= 1.
add %r22,%r23,%r22
ldo -1(%r28),%r28
; Return; the remainder is stored in the delay slot.
L$2 bv 0(%r2)
stws %r22,0(0,%r26)
.exit
.procend

65
sysdeps/hppa/lshift.s Normal file
View File

@ -0,0 +1,65 @@
; HP-PA __mpn_lshift --
; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr gr26
; s_ptr gr25
; size gr24
; cnt gr23
.code
.export __mpn_lshift
; __mpn_lshift -- shift the SIZE (gr24) limbs at s_ptr (gr25) left by CNT
; (gr23) bits and store the result at res_ptr (gr26).  The bits shifted out
; of the top limb are left in %r28 on return.
;
; Limbs are processed from the highest address downwards.  vshd extracts a
; 32-bit field from a register pair at the position held in the shift amount
; register, which is set to 32 - CNT.  The loop is unrolled twice, with
; %r22 and %r29 alternating as the "previous" and "current" limb.
; The instruction after each branch sits in the branch delay slot.
__mpn_lshift
.proc
.callinfo frame=64,no_calls
.entry
; Point both pointers just past the end of their vectors (ptr + 4*size).
sh2add %r24,%r25,%r25
sh2add %r24,%r26,%r26
; Load the top limb, pre-decrementing the pointer.
ldws,mb -4(0,%r25),%r22
subi 32,%r23,%r1
mtsar %r1
addib,= -1,%r24,L$0004
vshd %r0,%r22,%r28 ; compute carry out limb
ldws,mb -4(0,%r25),%r29
addib,= -1,%r24,L$0002
vshd %r22,%r29,%r20
L$loop ldws,mb -4(0,%r25),%r22
stws,mb %r20,-4(0,%r26)
addib,= -1,%r24,L$0003
vshd %r29,%r22,%r20
ldws,mb -4(0,%r25),%r29
stws,mb %r20,-4(0,%r26)
addib,<> -1,%r24,L$loop
vshd %r22,%r29,%r20
; Exit with the last loaded limb in %r29: store the pending result limb,
; then the lowest result limb, which has zeros shifted in from %r0.
L$0002 stws,mb %r20,-4(0,%r26)
vshd %r29,%r0,%r20
bv 0(%r2)
stw %r20,-4(0,%r26)
; Exit with the last loaded limb in %r22 (L$0004 is also the SIZE == 1 path).
L$0003 stws,mb %r20,-4(0,%r26)
L$0004 vshd %r22,%r0,%r20
bv 0(%r2)
stw %r20,-4(0,%r26)
.exit
.procend

62
sysdeps/hppa/rshift.s Normal file
View File

@ -0,0 +1,62 @@
; HP-PA __mpn_rshift --
; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr gr26
; s_ptr gr25
; size gr24
; cnt gr23
.code
.export __mpn_rshift
; __mpn_rshift -- shift the SIZE (gr24) limbs at s_ptr (gr25) right by CNT
; (gr23) bits and store the result at res_ptr (gr26).  The bits shifted out
; of the lowest limb are left in %r28 on return.
;
; Limbs are processed from the lowest address upwards.  vshd extracts a
; 32-bit field from a register pair at the position held in the shift amount
; register, which is set to CNT.  The loop is unrolled twice, with %r22 and
; %r29 alternating as the "previous" and "current" limb.
; The instruction after each branch sits in the branch delay slot.
__mpn_rshift
.proc
.callinfo frame=64,no_calls
.entry
; Load the lowest limb, post-incrementing the pointer.
ldws,ma 4(0,%r25),%r22
mtsar %r23
addib,= -1,%r24,L$0004
vshd %r22,%r0,%r28 ; compute carry out limb
ldws,ma 4(0,%r25),%r29
addib,= -1,%r24,L$0002
vshd %r29,%r22,%r20
L$loop ldws,ma 4(0,%r25),%r22
stws,ma %r20,4(0,%r26)
addib,= -1,%r24,L$0003
vshd %r22,%r29,%r20
ldws,ma 4(0,%r25),%r29
stws,ma %r20,4(0,%r26)
addib,<> -1,%r24,L$loop
vshd %r29,%r22,%r20
; Exit with the last loaded limb in %r29: store the pending result limb,
; then the top result limb, which has zeros shifted in from %r0.
L$0002 stws,ma %r20,4(0,%r26)
vshd %r0,%r29,%r20
bv 0(%r2)
stw %r20,0(0,%r26)
; Exit with the last loaded limb in %r22 (L$0004 is also the SIZE == 1 path).
L$0003 stws,ma %r20,4(0,%r26)
L$0004 vshd %r0,%r22,%r20
bv 0(%r2)
stw %r20,0(0,%r26)
.exit
.procend

58
sysdeps/hppa/sub_n.s Normal file
View File

@ -0,0 +1,58 @@
; HP-PA __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
; store difference in a third limb vector.
; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr gr26
; s1_ptr gr25
; s2_ptr gr24
; size gr23
; One might want to unroll this as for other processors, but it turns
; out that the data cache contention after a store makes such
; unrolling useless. We can't come under 5 cycles/limb anyway.
.code
.export __mpn_sub_n
; __mpn_sub_n -- compute s1 - s2 limb by limb over SIZE (gr23) limbs from
; s1_ptr (gr25) and s2_ptr (gr24), storing the difference at res_ptr (gr26).
; The borrow out of the last limb is left in %r28 (0 or 1) on return.
;
; The instruction after each branch sits in the branch delay slot and is
; executed whether or not the branch is taken.  The borrow state produced by
; one sub/subb has to survive the loads, store and addib that precede the
; next subb.
__mpn_sub_n
.proc
.callinfo frame=0,no_calls
.entry
; Load the first limb of each operand, post-incrementing the pointers.
ldws,ma 4(0,%r25),%r20
ldws,ma 4(0,%r24),%r19
addib,= -1,%r23,L$end ; check for (SIZE == 1)
sub %r20,%r19,%r28 ; subtract first limbs ignoring cy
; Each iteration loads the next limb pair, stores the previous difference,
; and subtracts the new pair with borrow in the delay slot of the loop branch.
L$loop ldws,ma 4(0,%r25),%r20
ldws,ma 4(0,%r24),%r19
stws,ma %r28,4(0,%r26)
addib,<> -1,%r23,L$loop
subb %r20,%r19,%r28
L$end stws %r28,0(0,%r26)
; Read the carry bit into %r28, then return 1 - carry as the borrow
; (computed in the delay slot of the return).
addc %r0,%r0,%r28
bv 0(%r2)
subi 1,%r28,%r28
.exit
.procend

285
sysdeps/hppa/udiv_qrnnd.s Normal file
View File

@ -0,0 +1,285 @@
; HP-PA __udiv_qrnnd division support, used from longlong.h.
; This version runs fast on pre-PA7000 CPUs.
; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; rem_ptr gr26
; n1 gr25
; n0 gr24
; d gr23
; The code size is a bit excessive. We could merge the last two ds;addc
; sequences by simply moving the "bb,< Odd" instruction down. The only
; trouble is the FFFFFFFF code that would need some hacking.
.code
.export __udiv_qrnnd
; __udiv_qrnnd -- divide the two-word value (n1,n0) = (gr25,gr24) by d (gr23)
; using the divide-step instruction.  The quotient is left in %r28 and the
; remainder is stored through rem_ptr (gr26).
;
; Three straight-line paths follow, each a fully unrolled run of addc/ds
; pairs: "addc %r24,%r24,%r24" shifts n0 left by one, bringing in the carry
; left by the previous ds, and "ds" performs one divide step on the partial
; remainder in %r25.
;   - default path: d is non-negative as a signed value;
;   - L$largedivisor: d has its top bit set and is even;
;   - L$odd: d has its top bit set and is odd (L$FF.. handles d == 0xFFFFFFFF).
; The instruction after each branch sits in the branch delay slot unless the
; branch carries the ",n" completer.
__udiv_qrnnd
.proc
.callinfo frame=0,no_calls
.entry
; Take the large-divisor path when d compares below zero as a signed value.
; The delay-slot instruction leaves -d in %r1 on both paths.
comb,< %r23,0,L$largedivisor
sub %r0,%r23,%r1 ; clear cy as side-effect
; Initial divide step on zero operands, ahead of the real steps.
ds %r0,%r1,%r0
; pairs 1-8
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
; pairs 9-16
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
; pairs 17-24
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
; pairs 25-31
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
addc %r24,%r24,%r24
ds %r25,%r23,%r25
; pair 32: the shifted quotient bits now go to %r28.
addc %r24,%r24,%r28
ds %r25,%r23,%r25
; If the partial remainder ended up negative, add d back (the addl is
; skipped when %r25 >= 0).
comclr,>= %r25,%r0,%r0
addl %r25,%r23,%r25
stws %r25,0(0,%r26)
; Return; the last quotient bit is shifted into %r28 in the delay slot.
bv 0(%r2)
addc %r28,%r28,%r28
; Large even divisor: halve both the dividend and the divisor, divide, and
; rebuild the remainder as 2*r + (n0 & 1).
L$largedivisor
extru %r24,31,1,%r19 ; r19 = n0 & 1
bb,< %r23,31,L$odd
extru %r23,30,31,%r22 ; r22 = d >> 1
shd %r25,%r24,1,%r24 ; r24 = new n0
extru %r25,30,31,%r25 ; r25 = new n1
sub %r0,%r22,%r21
ds %r0,%r21,%r0
; pairs 1-8
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
; pairs 9-16
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
; pairs 17-24
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
; pairs 25-32
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
; Fix up a negative partial remainder, then r = 2*r + (n0 & 1).
comclr,>= %r25,%r0,%r0
addl %r25,%r22,%r25
sh1addl %r25,%r19,%r25
stws %r25,0(0,%r26)
; Return; the last quotient bit is shifted into %r28 in the delay slot.
bv 0(%r2)
addc %r24,%r24,%r28
; Large odd divisor: divide the halved dividend by d/2 + 1, then correct the
; quotient and remainder afterwards.  The addib branches to L$FF.. when the
; increment overflows.
L$odd addib,sv,n 1,%r22,L$FF.. ; r22 = (d / 2 + 1)
shd %r25,%r24,1,%r24 ; r24 = new n0
extru %r25,30,31,%r25 ; r25 = new n1
sub %r0,%r22,%r21
ds %r0,%r21,%r0
; pairs 1-8
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
; pairs 9-16
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
; pairs 17-24
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
; pairs 25-32
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
addc %r24,%r24,%r24
ds %r25,%r22,%r25
; Shift in the last quotient bit, fix up a negative partial remainder, and
; rebuild r' = 2*r + (n0 & 1).
addc %r24,%r24,%r28
comclr,>= %r25,%r0,%r0
addl %r25,%r22,%r25
sh1addl %r25,%r19,%r25
; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
; Correction: r = r' + q'.  %r1 still holds -d from the entry delay slot, so
; each "addl %r25,%r1,%r25" subtracts d; every such subtraction is paired
; with an increment of the quotient through the following addc.
add,nuv %r28,%r25,%r25
addl %r25,%r1,%r25
addc %r0,%r28,%r28
sub,<< %r25,%r23,%r0
addl %r25,%r1,%r25
stws %r25,0(0,%r26)
bv 0(%r2)
addc %r0,%r28,%r28
; This is just a special case of the code above.
; We come here when d == 0xFFFFFFFF
L$FF.. add,uv %r25,%r24,%r24
sub,<< %r24,%r23,%r0
ldo 1(%r24),%r24
stws %r24,0(0,%r26)
bv 0(%r2)
addc %r0,%r25,%r28
.exit
.procend

View File

@ -1,7 +1,7 @@
/* i80386 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
sum in a third limb vector.
Copyright (C) 1992, 1994 Free Software Foundation, Inc.
Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_add_n:)
subl %eax,%edx /* ... enter the loop */
shrl $2,%eax /* restore previous value */
#ifdef PIC
call here
here: leal (Loop - 3 - here)(%eax,%eax,8),%eax
addl %eax,(%esp)
ret
/* Calculate start address in loop for PIC. Due to limitations in some
assemblers, Loop-L0-3 cannot be put into the leal */
call L0
L0: leal (%eax,%eax,8),%eax
addl (%esp),%eax
addl $(Loop-L0-3),%eax
addl $4,%esp
#else
leal (Loop - 3)(%eax,%eax,8),%eax /* calc start addr in loop */
jmp *%eax /* jump into loop */
/* Calculate start address in loop for non-PIC. */
leal (Loop - 3)(%eax,%eax,8),%eax
#endif
jmp *%eax /* jump into loop */
ALIGN (3)
Loop: movl (%esi),%eax
adcl (%edx),%eax

28
sysdeps/i386/gmp-mparam.h Normal file
View File

@ -0,0 +1,28 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/* Size of a char and of one multi-precision limb on this target.  */
#define BITS_PER_CHAR 8
#define BYTES_PER_MP_LIMB 4
#define BITS_PER_MP_LIMB 32

/* Widths, in bits, of the native integer types.  */
#define BITS_PER_SHORTINT 16
#define BITS_PER_INT 32
#define BITS_PER_LONGINT 32

/* Zero here; presumably tells the generic code that IEEE doubles are not
   laid out big-endian on this target -- confirm against the users of
   this macro.  */
#define IEEE_DOUBLE_BIG_ENDIAN 0

260
sysdeps/i386/i486/strcat.S Normal file
View File

@ -0,0 +1,260 @@
/* strcat(dest, src) -- Append SRC on the end of DEST.
For Intel 80x86, x>=4.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
dest (sp + 4)
src (sp + 8)
*/
.text
ENTRY (strcat)
/* char *strcat (char *dest, const char *src)

   Appends the NUL terminated string SRC to the end of DEST and returns
   DEST in %eax.

   Phase 1 scans DEST for its terminating NUL, first byte-wise until the
   pointer is 4-byte aligned, then four words per loop round.
   Phase 2 copies SRC, again byte-wise until SRC is aligned and then four
   words per round, finishing the last word byte by byte.

   Register use:
     %ecx  source pointer
     %edx  destination scan pointer; from label L2 on it holds
	   (end of DEST - SRC) so that (%ecx,%edx) addresses the
	   destination byte matching source byte (%ecx)
     %eax  word/byte currently examined
     %edi  scratch for the magic-value NUL test (saved and restored)

   All byte tests use byte registers (%dl, %cl, %al) together with the
   `b' suffix; mixing a byte mnemonic with a 32-bit register (or `andl'
   with %al) is rejected by current assemblers and, where accepted, may
   test more than the intended byte.  */
	pushl %edi		/* Save callee-saved register.  */
	movl 12(%esp), %ecx	/* load source pointer */
	movl 8(%esp), %edx	/* load destination pointer */
	testb $0xff, (%ecx)	/* Is source string empty? */
	jz L8			/* yes => return */
	/* Test the first bytes separately until destination is aligned.  */
	testb $3, %dl		/* destination pointer aligned? */
	jz L1			/* yes => begin scan loop */
	testb $0xff, (%edx)	/* is end of string? */
	jz L2			/* yes => start appending */
	incl %edx		/* increment destination pointer */
	testb $3, %dl		/* destination pointer aligned? */
	jz L1			/* yes => begin scan loop */
	testb $0xff, (%edx)	/* is end of string? */
	jz L2			/* yes => start appending */
	incl %edx		/* increment destination pointer */
	testb $3, %dl		/* destination pointer aligned? */
	jz L1			/* yes => begin scan loop */
	testb $0xff, (%edx)	/* is end of string? */
	jz L2			/* yes => start appending */
	incl %edx		/* increment destination pointer */
	/* Now we are aligned.  Begin scan loop.  */
	jmp L1
	ALIGN(4)
L4:	addl $16,%edx		/* advance destination pointer one round */
L1:	movl (%edx), %eax	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	/* If you compare this with the algorithm in memchr.S you will
	   notice that here is an `xorl' statement missing.  But you must
	   not forget that we are looking for C == 0 and `xorl $0, %eax'
	   is a no-op.  */
	addl %eax, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	/* According to the algorithm we had to reverse the effect of the
	   XOR first and then test the overflow bits.  But because the
	   following XOR would destroy the carry flag and it would (in a
	   representation with more than 32 bits) not alter the last
	   overflow, we can now test this condition.  If no carry is
	   signaled no overflow must have occurred in the last byte
	   => it was 0.  */
	jnc L3
	/* We are only interested in carry bits that change due to the
	   previous add, so remove original bits.  */
	xorl %eax, %edi		/* (word+magic)^word */
	/* Now test for the other three overflow bits.  */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	/* If at least one byte of the word is NUL we don't get 0 in %edi.  */
	jnz L3
	movl 4(%edx), %eax	/* get next word of destination */
	movl $0xfefefeff, %edi	/* magic value */
	addl %eax, %edi		/* carry bits for each non-NUL byte */
	jnc L5			/* highest byte is NUL => stop scanning */
	xorl %eax, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* 0 only if every carry bit was set */
	jnz L5			/* one byte is NUL => stop scanning */
	movl 8(%edx), %eax	/* get next word of destination */
	movl $0xfefefeff, %edi	/* magic value */
	addl %eax, %edi		/* carry bits for each non-NUL byte */
	jnc L6			/* highest byte is NUL => stop scanning */
	xorl %eax, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* 0 only if every carry bit was set */
	jnz L6			/* one byte is NUL => stop scanning */
	movl 12(%edx), %eax	/* get next word of destination */
	movl $0xfefefeff, %edi	/* magic value */
	addl %eax, %edi		/* carry bits for each non-NUL byte */
	jnc L7			/* highest byte is NUL => stop scanning */
	xorl %eax, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* 0 only if every carry bit was set */
	jz L4			/* no byte is NUL => carry on scanning */
	/* The labels below fall through so that %edx ends up pointing at
	   the word in which the NUL byte was found.  */
L7:	addl $4, %edx		/* adjust destination pointer */
L6:	addl $4, %edx
L5:	addl $4, %edx
	/* %eax holds the word containing the NUL; locate the exact byte.
	   If none of the first three bytes is NUL it must be the fourth.  */
L3:	testb %al, %al		/* is first byte NUL? */
	jz L2			/* yes => start copying */
	incl %edx		/* increment destination pointer */
	testb %ah, %ah		/* is second byte NUL? */
	jz L2			/* yes => start copying */
	incl %edx		/* increment destination pointer */
	testl $0xff0000, %eax	/* is third byte NUL? */
	jz L2			/* yes => start copying */
	incl %edx		/* increment destination pointer */
L2:	subl %ecx, %edx		/* reduce number of loop variants: from now
				   on only %ecx has to be advanced */
	/* Now we have to align the source pointer.  */
	testb $3, %cl		/* pointer correctly aligned? */
	jz L29			/* yes => start copy loop */
	movb (%ecx), %al	/* get first byte */
	movb %al, (%ecx,%edx)	/* and store it */
	andb %al, %al		/* is byte NUL? */
	jz L8			/* yes => return */
	incl %ecx		/* increment pointer */
	testb $3, %cl		/* pointer correctly aligned? */
	jz L29			/* yes => start copy loop */
	movb (%ecx), %al	/* get next byte */
	movb %al, (%ecx,%edx)	/* and store it */
	andb %al, %al		/* is byte NUL? */
	jz L8			/* yes => return */
	incl %ecx		/* increment pointer */
	testb $3, %cl		/* pointer correctly aligned? */
	jz L29			/* yes => start copy loop */
	movb (%ecx), %al	/* get next byte */
	movb %al, (%ecx,%edx)	/* and store it */
	andb %al, %al		/* is byte NUL? */
	jz L8			/* yes => return */
	incl %ecx		/* increment pointer */
	/* Now we are aligned.  */
	jmp L29			/* start copy loop */
	ALIGN(4)
L28:	movl %eax, 12(%ecx,%edx)/* store fourth word of the round */
	addl $16, %ecx		/* adjust pointer for full round */
L29:	movl (%ecx), %eax	/* get word from source */
	movl $0xfefefeff, %edi	/* magic value */
	addl %eax, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L9			/* highest byte is NUL => stop copying */
	xorl %eax, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L9			/* one byte is NUL => stop copying */
	movl %eax, (%ecx,%edx)	/* store word to destination */
	movl 4(%ecx), %eax	/* get word from source */
	movl $0xfefefeff, %edi	/* magic value */
	addl %eax, %edi		/* carry bits for each non-NUL byte */
	jnc L91			/* highest byte is NUL => stop copying */
	xorl %eax, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* 0 only if every carry bit was set */
	jnz L91			/* one byte is NUL => stop copying */
	movl %eax, 4(%ecx,%edx)	/* store word to destination */
	movl 8(%ecx), %eax	/* get word from source */
	movl $0xfefefeff, %edi	/* magic value */
	addl %eax, %edi		/* carry bits for each non-NUL byte */
	jnc L92			/* highest byte is NUL => stop copying */
	xorl %eax, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* 0 only if every carry bit was set */
	jnz L92			/* one byte is NUL => stop copying */
	movl %eax, 8(%ecx,%edx)	/* store word to destination */
	movl 12(%ecx), %eax	/* get word from source */
	movl $0xfefefeff, %edi	/* magic value */
	addl %eax, %edi		/* carry bits for each non-NUL byte */
	jnc L93			/* highest byte is NUL => stop copying */
	xorl %eax, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* 0 only if every carry bit was set */
	jz L28			/* no byte is NUL => carry on copying */
	/* Fall-through chain: make %ecx point at the source word that
	   contains the NUL (still held in %eax).  */
L93:	addl $4, %ecx		/* adjust pointer */
L92:	addl $4, %ecx
L91:	addl $4, %ecx
L9:	movb %al, (%ecx,%edx)	/* store first byte of last word */
	orb %al, %al		/* is it NUL? */
	jz L8			/* yes => return */
	movb %ah, 1(%ecx,%edx)	/* store second byte of last word */
	orb %ah, %ah		/* is it NUL? */
	jz L8			/* yes => return */
	shrl $16, %eax		/* make upper bytes accessible */
	movb %al, 2(%ecx,%edx)	/* store third byte of last word */
	orb %al, %al		/* is it NUL? */
	jz L8			/* yes => return */
	movb %ah, 3(%ecx,%edx)	/* store fourth byte of last word; it must
				   be the NUL since the word contains one */
L8:	movl 8(%esp), %eax	/* start address of destination is result */
	popl %edi		/* restore saved register */
	ret

132
sysdeps/i386/i486/strlen.S Normal file
View File

@ -0,0 +1,132 @@
/* strlen(str) -- determine the length of the string STR.
Optimized for Intel 80x86, x>=4.
Copyright (C) 1991, 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
str (sp + 4)
*/
.text
ENTRY (strlen)
/* Returns in %eax the number of bytes before the terminating NUL of the
   string whose address is at 4(%esp).
   Register use: %eax = scan pointer; %ecx = alignment bits during the
   byte-wise prologue, afterwards the word under test; %edx = scratch
   for the magic-value NUL test.
   Up to three bytes are tested singly until the pointer is 4-byte
   aligned, then four words are tested per loop round.  */
movl 4(%esp), %ecx /* get string pointer */
movl %ecx, %eax /* duplicate it */
andl $3, %ecx /* mask alignment bits */
jz L1 /* aligned => start loop */
/* %ecx is at most 3 from here until it is reloaded at L1, so %ch is
   zero and `cmpb %ch, ...' compares against NUL.  */
cmpb %ch, (%eax) /* is byte NUL? */
je L2 /* yes => return */
incl %eax /* increment pointer */
xorl $3, %ecx /* was alignment = 3? */
jz L1 /* yes => now it is aligned and start loop */
cmpb %ch, (%eax) /* is byte NUL? */
je L2 /* yes => return */
addl $1, %eax /* increment pointer */
subl $1, %ecx /* was alignment = 2? */
jz L1 /* yes => now it is aligned and start loop */
cmpb %ch, (%eax) /* is byte NUL? */
je L2 /* yes => return */
/* Don't change the above `addl $1,%eax' and `subl $1, %ecx' into `incl %eax'
and `decl %ecx' resp. The additional two bytes per instruction make the
label L4 aligned on a 16 byte boundary with nops.
The following `subl $15, %eax' is part of this trick, too. Together with
the next instruction (`addl $16, %eax') it is in fact an `incl %eax', just
as expected from the algorithm. But doing so has the advantage that
no jump to label L1 is necessary and so the pipeline is not flushed. */
subl $15, %eax /* effectively +1 */
L4: addl $16, %eax /* adjust pointer for full loop */
L1: movl (%eax), %ecx /* get word (= 4 bytes) in question */
movl $0xfefefeff, %edx /* magic value */
addl %ecx, %edx /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* 0 */
jnc L3 /* highest byte is NUL => return pointer */
xorl %ecx, %edx /* (word+magic)^word */
orl $0xfefefeff, %edx /* set all non-carry bits */
incl %edx /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L3 /* found NUL => return pointer */
movl 4(%eax), %ecx /* get word (= 4 bytes) in question */
movl $0xfefefeff, %edx /* magic value */
addl %ecx, %edx /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* 0 */
jnc L5 /* highest byte is NUL => return pointer */
xorl %ecx, %edx /* (word+magic)^word */
orl $0xfefefeff, %edx /* set all non-carry bits */
incl %edx /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L5 /* found NUL => return pointer */
movl 8(%eax), %ecx /* get word (= 4 bytes) in question */
movl $0xfefefeff, %edx /* magic value */
addl %ecx, %edx /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* 0 */
jnc L6 /* highest byte is NUL => return pointer */
xorl %ecx, %edx /* (word+magic)^word */
orl $0xfefefeff, %edx /* set all non-carry bits */
incl %edx /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L6 /* found NUL => return pointer */
movl 12(%eax), %ecx /* get word (= 4 bytes) in question */
movl $0xfefefeff, %edx /* magic value */
addl %ecx, %edx /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* 0 */
jnc L7 /* highest byte is NUL => return pointer */
xorl %ecx, %edx /* (word+magic)^word */
orl $0xfefefeff, %edx /* set all non-carry bits */
incl %edx /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jz L4 /* no NUL found => continue loop */
/* Fall-through chain: advance %eax to the word (still in %ecx) in which
   the NUL byte was detected. */
L7: addl $4, %eax /* adjust pointer */
L6: addl $4, %eax
L5: addl $4, %eax
/* Locate the NUL inside the word; if none of the first three bytes is
   NUL it must be the fourth, hence the unconditional third increment. */
L3: testb %cl, %cl /* is first byte NUL? */
jz L2 /* yes => return */
incl %eax /* increment pointer */
testb %ch, %ch /* is second byte NUL? */
jz L2 /* yes => return */
incl %eax /* increment pointer */
testl $0xff0000, %ecx /* is third byte NUL? */
jz L2 /* yes => return pointer */
incl %eax /* increment pointer */
L2: subl 4(%esp), %eax /* compute difference to string start */
ret

View File

@ -0,0 +1,2 @@
# Code optimized for i486 is better than simple i386 code.
i386/i486

136
sysdeps/i386/i586/add_n.S Normal file
View File

@ -0,0 +1,136 @@
/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
sum in a third limb vector.
Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/*
INPUT PARAMETERS
res_ptr (sp + 4)
s1_ptr (sp + 8)
s2_ptr (sp + 12)
size (sp + 16)
*/
/* Symbolic register names used by __mpn_add_n below:
   r1/r2 hold the two sum limbs currently being computed, src1/src2 and
   dst are the running operand/result pointers, and x holds the src2
   limb that was loaded ahead of its use.  */
#define r1 %eax
#define r2 %edx
#define src1 %esi
#define src2 %ebp
#define dst %edi
#define x %ebx
#include "sysdep.h"
#include "asm-syntax.h"
.text
ALIGN (3)
.globl C_SYMBOL_NAME(__mpn_add_n)
/* Adds the SIZE-limb vectors at s1_ptr and s2_ptr, stores the sum at
   res_ptr and returns the carry out (0 or 1) in %eax.
   size-1 limbs are handled by the loops (8 per round in Loop, then one
   per round in Loop2); the final limb is added at Lend2.  The carry
   flag is live across the loops, which is why only instructions that
   leave it untouched (movl, leal, decl/incl, push/pop) appear between
   the adcl instructions.  */
C_SYMBOL_NAME(__mpn_add_n:)
pushl %edi
pushl %esi
pushl %ebx
pushl %ebp
/* Four registers were pushed, so the arguments are 16 bytes further up
   than the (sp + N) offsets listed in the parameter comment.  */
movl 20(%esp),dst /* res_ptr */
movl 24(%esp),src1 /* s1_ptr */
movl 28(%esp),src2 /* s2_ptr */
movl 32(%esp),%ecx /* size */
movl (src2),x /* preload first limb of s2 */
decl %ecx /* the last limb is handled separately at Lend2 */
movl %ecx,r2
shrl $3,%ecx /* %ecx = number of 8-limb rounds */
andl $7,r2 /* r2 = number of remaining single-limb rounds */
testl %ecx,%ecx /* zero carry flag */
jz Lend
pushl r2 /* r2 (%edx) is used as scratch inside Loop */
ALIGN (3)
Loop: movl 28(dst),%eax /* fetch destination cache line */
leal 32(dst),dst
L1: movl (src1),r1
movl 4(src1),r2
adcl x,r1
movl 4(src2),x
adcl x,r2
movl 8(src2),x
movl r1,-32(dst)
movl r2,-28(dst)
L2: movl 8(src1),r1
movl 12(src1),r2
adcl x,r1
movl 12(src2),x
adcl x,r2
movl 16(src2),x
movl r1,-24(dst)
movl r2,-20(dst)
L3: movl 16(src1),r1
movl 20(src1),r2
adcl x,r1
movl 20(src2),x
adcl x,r2
movl 24(src2),x
movl r1,-16(dst)
movl r2,-12(dst)
L4: movl 24(src1),r1
movl 28(src1),r2
adcl x,r1
movl 28(src2),x
adcl x,r2
movl 32(src2),x /* preload first s2 limb of the next round */
movl r1,-8(dst)
movl r2,-4(dst)
leal 32(src1),src1 /* leal does not modify the carry flag */
leal 32(src2),src2
decl %ecx /* decl leaves the carry flag unchanged */
jnz Loop
popl r2 /* restore count of remaining single limbs */
Lend:
decl r2 /* test r2 w/o clobbering carry */
js Lend2 /* r2 was 0 => only the final limb is left */
incl r2 /* undo the decrement */
Loop2:
leal 4(dst),dst
movl (src1),r1
adcl x,r1
movl 4(src2),x /* preload next s2 limb */
movl r1,-4(dst)
leal 4(src1),src1
leal 4(src2),src2
decl r2
jnz Loop2
Lend2:
movl (src1),r1 /* final limb; x already holds the matching s2 limb */
adcl x,r1
movl r1,(dst)
sbbl %eax,%eax /* %eax = -1 if carry set, else 0 */
negl %eax /* return value: carry out as 0 or 1 */
popl %ebp
popl %ebx
popl %esi
popl %edi
ret

View File

@ -0,0 +1,84 @@
/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
the result to a second limb vector.
Copyright (C) 1992, 1994 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/*
INPUT PARAMETERS
res_ptr (sp + 4)
s1_ptr (sp + 8)
size (sp + 12)
s2_limb (sp + 16)
*/
#include "sysdep.h"
#include "asm-syntax.h"
/* Register assignments for __mpn_addmul_1.  %ecx is used directly as
   the (negative) limb index.  */
#define res_ptr edi
#define s1_ptr esi
#define s2_limb ebp
TEXT
ALIGN (3)
GLOBL C_SYMBOL_NAME(__mpn_addmul_1)
.type C_SYMBOL_NAME(__mpn_addmul_1),@function
/* Multiplies the SIZE-limb vector at s1_ptr by s2_limb, adds the
   products to the vector at res_ptr in place, and returns the final
   carry limb in %eax.
   NOTE(review): INSN1/INSN2, R, MEM_DISP and MEM_INDEX come from
   asm-syntax.h, which is not visible here.  The comments below assume
   destination-first operand order, which is what the register usage of
   this routine implies -- confirm against that header.  */
C_SYMBOL_NAME(__mpn_addmul_1:)
INSN1(push,l ,R(edi))
INSN1(push,l ,R(esi))
INSN1(push,l ,R(ebx))
INSN1(push,l ,R(ebp))
/* Load the four arguments; offsets are the documented (sp + N) values
   plus 16 for the four registers pushed above.  */
INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20))
INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24))
INSN2(mov,l ,R(ecx),MEM_DISP(esp,28))
INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32))
/* Point both pointers past the end of their vectors and negate the
   count, so that the index in %ecx runs from -size up to 0 and the
   loop can terminate on the zero flag of the increment.  */
INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,ecx,4))
INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4))
INSN1(neg,l ,R(ecx))
INSN2(xor,l ,R(edx),R(edx)) /* initial carry limb = 0 */
ALIGN (3)
Loop:
INSN2(mov,l ,R(ebx),R(edx)) /* carry limb from the previous round */
INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,ecx,4)) /* next limb of s1 */
INSN1(mul,l ,R(s2_limb)) /* product in %edx:%eax */
INSN2(add,l ,R(eax),R(ebx)) /* add previous carry limb to low half */
INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,ecx,4)) /* current result limb */
INSN2(adc,l ,R(edx),$0) /* propagate carry into high half */
INSN2(add,l ,R(ebx),R(eax)) /* accumulate low half into result limb */
INSN2(adc,l ,R(edx),$0) /* propagate carry into high half */
INSN2(mov,l ,MEM_INDEX(res_ptr,ecx,4),R(ebx)) /* store updated limb */
INSN1(inc,l ,R(ecx))
INSN1(jnz, ,Loop)
INSN2(mov,l ,R(eax),R(edx)) /* return the last carry limb */
INSN1(pop,l ,R(ebp))
INSN1(pop,l ,R(ebx))
INSN1(pop,l ,R(esi))
INSN1(pop,l ,R(edi))
ret
/* Lfe1 marks the end of the function for the .size directive.  */
Lfe1:
.size C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1)

213
sysdeps/i386/i586/lshift.S Normal file
View File

@ -0,0 +1,213 @@
/* Pentium optimized __mpn_lshift --
Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/*
INPUT PARAMETERS
res_ptr (sp + 4)
s_ptr (sp + 8)
size (sp + 12)
cnt (sp + 16)
*/
#include "sysdep.h"
#include "asm-syntax.h"
.text
ALIGN (3)
.globl C_SYMBOL_NAME(__mpn_lshift)
/* Shifts the SIZE-limb vector at s_ptr left by CNT bits, stores the
   result at res_ptr and returns in %eax the bits shifted out of the
   most significant limb.

   Two code paths exist:
     Lnormal   works for any CNT and walks from the most significant
	       limb downwards, which is the safe direction for a left
	       shift whose destination overlaps the source from above.
     Lspecial  handles CNT == 1 with add-with-carry and walks from the
	       least significant limb upwards.  Every store of res[k]
	       happens after s[k+1] has been loaded, so this direction
	       is only safe if res_ptr <= s_ptr + 1 limb or if the two
	       vectors do not overlap at all.

   The dispatch below therefore compares real addresses.  Comparing the
   byte difference of the pointers with the limb count SIZE would send
   overlapping operands with res_ptr > s_ptr + 1 limb down the
   ascending path and overwrite source limbs before they are read.  */
C_SYMBOL_NAME(__mpn_lshift:)
	pushl %edi
	pushl %esi
	pushl %ebx
	pushl %ebp
	movl 20(%esp),%edi	/* res_ptr */
	movl 24(%esp),%esi	/* s_ptr */
	movl 28(%esp),%ebp	/* size */
	movl 32(%esp),%ecx	/* cnt */
/* We can use faster code for shift-by-1 under certain conditions.  */
	cmp $1,%ecx
	jne Lnormal
	leal 4(%esi),%eax
	cmpl %edi,%eax
	jnc Lspecial		/* jump if s_ptr + 1 limb >= res_ptr */
	leal (%esi,%ebp,4),%eax
	cmpl %eax,%edi
	jnc Lspecial		/* jump if res_ptr >= s_ptr + size limbs */
Lnormal:
	leal -4(%edi,%ebp,4),%edi	/* point at most significant limbs */
	leal -4(%esi,%ebp,4),%esi
	movl (%esi),%edx
	subl $4,%esi
	xorl %eax,%eax
	shldl %cl,%edx,%eax	/* compute carry limb */
	pushl %eax		/* push carry limb onto stack */
	decl %ebp		/* the last limb is handled at Lend2 */
	pushl %ebp		/* keep size-1 for the remainder loop */
	shrl $3,%ebp		/* number of 8-limb rounds */
	jz Lend
	movl (%edi),%eax	/* fetch destination cache line */
	ALIGN (2)
Loop:	movl -28(%edi),%eax	/* fetch destination cache line */
	movl %edx,%ebx
	movl (%esi),%eax
	movl -4(%esi),%edx
	shldl %cl,%eax,%ebx
	shldl %cl,%edx,%eax
	movl %ebx,(%edi)
	movl %eax,-4(%edi)
	movl -8(%esi),%ebx
	movl -12(%esi),%eax
	shldl %cl,%ebx,%edx
	shldl %cl,%eax,%ebx
	movl %edx,-8(%edi)
	movl %ebx,-12(%edi)
	movl -16(%esi),%edx
	movl -20(%esi),%ebx
	shldl %cl,%edx,%eax
	shldl %cl,%ebx,%edx
	movl %eax,-16(%edi)
	movl %edx,-20(%edi)
	movl -24(%esi),%eax
	movl -28(%esi),%edx
	shldl %cl,%eax,%ebx
	shldl %cl,%edx,%eax
	movl %ebx,-24(%edi)
	movl %eax,-28(%edi)
	subl $32,%esi
	subl $32,%edi
	decl %ebp
	jnz Loop
Lend:	popl %ebp
	andl $7,%ebp		/* limbs left for the one-limb loop */
	jz Lend2
Loop2:	movl (%esi),%eax
	shldl %cl,%eax,%edx
	movl %edx,(%edi)
	movl %eax,%edx
	subl $4,%esi
	subl $4,%edi
	decl %ebp
	jnz Loop2
Lend2:	shll %cl,%edx		/* compute least significant limb */
	movl %edx,(%edi)	/* store it */
	popl %eax		/* pop carry limb */
	popl %ebp
	popl %ebx
	popl %esi
	popl %edi
	ret
/* Shift-by-1 path.  We loop from the least significant end of the
   arrays, which is only permissible under the conditions checked at
   the top of the function, because the function is documented to work
   for overlapping source and destination.  The bit shifted out of each
   limb travels to the next one in the carry flag, so between the adcl
   instructions only flag-preserving instructions (movl, leal) or ones
   that keep the carry (incl/decl) may be used.  */
Lspecial:
	movl (%esi),%edx
	addl $4,%esi
	decl %ebp		/* the last limb is stored at LL1 */
	pushl %ebp		/* keep size-1 for the remainder loop */
	shrl $3,%ebp		/* number of 8-limb rounds */
	addl %edx,%edx		/* shift first limb, top bit => carry */
	incl %ebp		/* incl/decl pair: test %ebp for zero */
	decl %ebp		/* without clobbering the carry flag */
	jz LLend
	movl (%edi),%eax	/* fetch destination cache line */
	ALIGN (2)
LLoop:	movl 28(%edi),%eax	/* fetch destination cache line */
	movl %edx,%ebx
	movl (%esi),%eax
	movl 4(%esi),%edx
	adcl %eax,%eax
	movl %ebx,(%edi)
	adcl %edx,%edx
	movl %eax,4(%edi)
	movl 8(%esi),%ebx
	movl 12(%esi),%eax
	adcl %ebx,%ebx
	movl %edx,8(%edi)
	adcl %eax,%eax
	movl %ebx,12(%edi)
	movl 16(%esi),%edx
	movl 20(%esi),%ebx
	adcl %edx,%edx
	movl %eax,16(%edi)
	adcl %ebx,%ebx
	movl %edx,20(%edi)
	movl 24(%esi),%eax
	movl 28(%esi),%edx
	adcl %eax,%eax
	movl %ebx,24(%edi)
	adcl %edx,%edx
	movl %eax,28(%edi)
	leal 32(%esi),%esi	/* use leal not to clobber carry */
	leal 32(%edi),%edi
	decl %ebp
	jnz LLoop
LLend:	popl %ebp
	sbbl %eax,%eax		/* save carry in %eax */
	andl $7,%ebp		/* limbs left; this clobbers the carry */
	jz LLend2
	addl %eax,%eax		/* restore carry from eax */
LLoop2:	movl %edx,%ebx
	movl (%esi),%edx
	adcl %edx,%edx
	movl %ebx,(%edi)
	leal 4(%esi),%esi	/* use leal not to clobber carry */
	leal 4(%edi),%edi
	decl %ebp
	jnz LLoop2
	jmp LL1
LLend2:	addl %eax,%eax		/* restore carry from eax */
LL1:	movl %edx,(%edi)	/* store last limb */
	sbbl %eax,%eax		/* %eax = -1 if a bit was shifted out */
	negl %eax		/* return it as 0 or 1 */
	popl %ebp
	popl %ebx
	popl %esi
	popl %edi
	ret

View File

@ -1,5 +1,5 @@
/* memcopy.h -- definitions for memory copy functions. Pentium version.
Copyright (C) 1994 Free Software Foundation, Inc.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Torbjorn Granlund (tege@sics.se).
This file is part of the GNU C Library.
@ -88,7 +88,7 @@ Cambridge, MA 02139, USA. */
"subl $32,%2\n" \
"jns 1b\n" \
"2: addl $32,%2" : \
"=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \
"0" (dst_bp), "1" (src_bp), "2" (nbytes) : \
"=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \
"0" (dst_ep), "1" (src_ep), "2" (nbytes) : \
"ax", "dx"); \
} while (0)

78
sysdeps/i386/i586/mul_1.S Normal file
View File

@ -0,0 +1,78 @@
/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
the result in a second limb vector.
Copyright (C) 1992, 1994 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/*
INPUT PARAMETERS
res_ptr (sp + 4)
s1_ptr (sp + 8)
size (sp + 12)
s2_limb (sp + 16)
*/
#include "sysdep.h"
#include "asm-syntax.h"
/* Register assignments for __mpn_mul_1.  `size' is negated below and
   then serves as the limb index.  */
#define res_ptr edi
#define s1_ptr esi
#define size ecx
#define s2_limb ebp
TEXT
ALIGN (3)
GLOBL C_SYMBOL_NAME(__mpn_mul_1)
/* Multiplies the SIZE-limb vector at s1_ptr by s2_limb, stores the low
   limbs of the result at res_ptr and returns the final carry limb in
   %eax.
   NOTE(review): INSN1/INSN2, R, MEM_DISP and MEM_INDEX come from
   asm-syntax.h, which is not visible here.  The comments below assume
   destination-first operand order, which is what the register usage of
   this routine implies -- confirm against that header.  */
C_SYMBOL_NAME(__mpn_mul_1:)
INSN1(push,l ,R(edi))
INSN1(push,l ,R(esi))
INSN1(push,l ,R(ebx))
INSN1(push,l ,R(ebp))
/* Load the four arguments; offsets are the documented (sp + N) values
   plus 16 for the four registers pushed above.  */
INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20))
INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24))
INSN2(mov,l ,R(size),MEM_DISP(esp,28))
INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32))
/* Point both pointers past the end of their vectors and negate the
   count, so that the index runs from -size up to 0 and the loop can
   terminate on the zero flag of the increment.  */
INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4))
INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
INSN1(neg,l ,R(size))
INSN2(xor,l ,R(edx),R(edx)) /* initial carry limb = 0 */
ALIGN (3)
Loop:
INSN2(mov,l ,R(ebx),R(edx)) /* carry limb from the previous round */
INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) /* next limb of s1 */
INSN1(mul,l ,R(s2_limb)) /* product in %edx:%eax */
INSN2(add,l ,R(eax),R(ebx)) /* add previous carry limb to low half */
INSN2(adc,l ,R(edx),$0) /* propagate carry into high half */
INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(eax)) /* store result limb */
INSN1(inc,l ,R(size))
INSN1(jnz, ,Loop)
INSN2(mov,l ,R(eax),R(edx)) /* return the last carry limb */
INSN1(pop,l ,R(ebp))
INSN1(pop,l ,R(ebx))
INSN1(pop,l ,R(esi))
INSN1(pop,l ,R(edi))
ret

213
sysdeps/i386/i586/rshift.S Normal file
View File

@ -0,0 +1,213 @@
/* Pentium optimized __mpn_rshift --
Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/*
INPUT PARAMETERS
res_ptr (sp + 4)
s_ptr (sp + 8)
size (sp + 12)
cnt (sp + 16)
*/
#include "sysdep.h"
#include "asm-syntax.h"
.text
ALIGN (3)
.globl C_SYMBOL_NAME(__mpn_rshift)
/* Shifts the SIZE-limb vector at s_ptr right by CNT bits, stores the
   result at res_ptr and returns in %eax the bits shifted out of the
   least significant limb (placed in the most significant bits of the
   return value).

   Two code paths exist:
     Lnormal   works for any CNT and walks from the least significant
	       limb upwards, which is the safe direction for a right
	       shift whose destination overlaps the source from below.
     Lspecial  handles CNT == 1 with rotate-through-carry and walks
	       from the most significant limb downwards.  Every store of
	       res[k] happens after s[k-1] has been loaded, so this
	       direction is only safe if res_ptr >= s_ptr - 1 limb or if
	       the two vectors do not overlap at all.

   The dispatch below therefore compares real addresses.  Comparing the
   unsigned byte difference res_ptr - s_ptr with the limb count SIZE
   would select the descending path for every res_ptr < s_ptr and
   overwrite source limbs before they are read whenever the destination
   lies two or more limbs below an overlapping source.  */
C_SYMBOL_NAME(__mpn_rshift:)
	pushl %edi
	pushl %esi
	pushl %ebx
	pushl %ebp
	movl 20(%esp),%edi	/* res_ptr */
	movl 24(%esp),%esi	/* s_ptr */
	movl 28(%esp),%ebp	/* size */
	movl 32(%esp),%ecx	/* cnt */
/* We can use faster code for shift-by-1 under certain conditions.  */
	cmp $1,%ecx
	jne Lnormal
	leal 4(%edi),%eax
	cmpl %esi,%eax
	jnc Lspecial		/* jump if res_ptr + 1 limb >= s_ptr */
	leal (%edi,%ebp,4),%eax
	cmpl %eax,%esi
	jnc Lspecial		/* jump if s_ptr >= res_ptr + size limbs */
Lnormal:
	movl (%esi),%edx
	addl $4,%esi
	xorl %eax,%eax
	shrdl %cl,%edx,%eax	/* compute carry limb */
	pushl %eax		/* push carry limb onto stack */
	decl %ebp		/* the last limb is handled at Lend2 */
	pushl %ebp		/* keep size-1 for the remainder loop */
	shrl $3,%ebp		/* number of 8-limb rounds */
	jz Lend
	movl (%edi),%eax	/* fetch destination cache line */
	ALIGN (2)
Loop:	movl 28(%edi),%eax	/* fetch destination cache line */
	movl %edx,%ebx
	movl (%esi),%eax
	movl 4(%esi),%edx
	shrdl %cl,%eax,%ebx
	shrdl %cl,%edx,%eax
	movl %ebx,(%edi)
	movl %eax,4(%edi)
	movl 8(%esi),%ebx
	movl 12(%esi),%eax
	shrdl %cl,%ebx,%edx
	shrdl %cl,%eax,%ebx
	movl %edx,8(%edi)
	movl %ebx,12(%edi)
	movl 16(%esi),%edx
	movl 20(%esi),%ebx
	shrdl %cl,%edx,%eax
	shrdl %cl,%ebx,%edx
	movl %eax,16(%edi)
	movl %edx,20(%edi)
	movl 24(%esi),%eax
	movl 28(%esi),%edx
	shrdl %cl,%eax,%ebx
	shrdl %cl,%edx,%eax
	movl %ebx,24(%edi)
	movl %eax,28(%edi)
	addl $32,%esi
	addl $32,%edi
	decl %ebp
	jnz Loop
Lend:	popl %ebp
	andl $7,%ebp		/* limbs left for the one-limb loop */
	jz Lend2
Loop2:	movl (%esi),%eax
	shrdl %cl,%eax,%edx	/* compute result limb */
	movl %edx,(%edi)
	movl %eax,%edx
	addl $4,%esi
	addl $4,%edi
	decl %ebp
	jnz Loop2
Lend2:	shrl %cl,%edx		/* compute most significant limb */
	movl %edx,(%edi)	/* store it */
	popl %eax		/* pop carry limb */
	popl %ebp
	popl %ebx
	popl %esi
	popl %edi
	ret
/* Shift-by-1 path.  We loop from the most significant end of the
   arrays, which is only permissible under the conditions checked at
   the top of the function, because the function is documented to work
   for overlapping source and destination.  The bit shifted out of each
   limb travels to the next lower one in the carry flag, so between the
   rcrl instructions only flag-preserving instructions (movl, leal) or
   ones that keep the carry (incl/decl) may be used.  */
Lspecial:
	leal -4(%edi,%ebp,4),%edi	/* point at most significant limbs */
	leal -4(%esi,%ebp,4),%esi
	movl (%esi),%edx
	subl $4,%esi
	decl %ebp		/* the last limb is stored at LL1 */
	pushl %ebp		/* keep size-1 for the remainder loop */
	shrl $3,%ebp		/* number of 8-limb rounds */
	shrl $1,%edx		/* shift top limb, low bit => carry */
	incl %ebp		/* incl/decl pair: test %ebp for zero */
	decl %ebp		/* without clobbering the carry flag */
	jz LLend
	movl (%edi),%eax	/* fetch destination cache line */
	ALIGN (2)
LLoop:	movl -28(%edi),%eax	/* fetch destination cache line */
	movl %edx,%ebx
	movl (%esi),%eax
	movl -4(%esi),%edx
	rcrl $1,%eax
	movl %ebx,(%edi)
	rcrl $1,%edx
	movl %eax,-4(%edi)
	movl -8(%esi),%ebx
	movl -12(%esi),%eax
	rcrl $1,%ebx
	movl %edx,-8(%edi)
	rcrl $1,%eax
	movl %ebx,-12(%edi)
	movl -16(%esi),%edx
	movl -20(%esi),%ebx
	rcrl $1,%edx
	movl %eax,-16(%edi)
	rcrl $1,%ebx
	movl %edx,-20(%edi)
	movl -24(%esi),%eax
	movl -28(%esi),%edx
	rcrl $1,%eax
	movl %ebx,-24(%edi)
	rcrl $1,%edx
	movl %eax,-28(%edi)
	leal -32(%esi),%esi	/* use leal not to clobber carry */
	leal -32(%edi),%edi
	decl %ebp
	jnz LLoop
LLend:	popl %ebp
	sbbl %eax,%eax		/* save carry in %eax */
	andl $7,%ebp		/* limbs left; this clobbers the carry */
	jz LLend2
	addl %eax,%eax		/* restore carry from eax */
LLoop2:	movl %edx,%ebx
	movl (%esi),%edx
	rcrl $1,%edx
	movl %ebx,(%edi)
	leal -4(%esi),%esi	/* use leal not to clobber carry */
	leal -4(%edi),%edi
	decl %ebp
	jnz LLoop2
	jmp LL1
LLend2:	addl %eax,%eax		/* restore carry from eax */
LL1:	movl %edx,(%edi)	/* store last limb */
	movl $0,%eax		/* movl keeps the carry flag intact */
	rcrl $1,%eax		/* return shifted-out bit in bit 31 */
	popl %ebp
	popl %ebx
	popl %esi
	popl %edi
	ret

334
sysdeps/i386/i586/strchr.S Normal file
View File

@ -0,0 +1,334 @@
/* strchr -- find character CH in a NUL terminated string.
Highly optimized version for ix85, x>=5.
Copyright (C) 1995 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
/* This version is especially optimized for the i586 (and following?)
processors. This is mainly done by using the two pipelines. The
version optimized for i486 is weak in this aspect because to get
as much parallelism as possible we have to execute some *more* instructions.
The code below is structured to reflect the pairing of the instructions
as *I think* it is. I have no processor data book to verify this.
If you find something you think is incorrect let me know. */
/* The magic value which is used throughout in the whole code. */
#define magic 0xfefefeff
/*
INPUT PARAMETERS:
str (sp + 4)
ch (sp + 8)
*/
.text
/* strchr (str, ch): search the NUL terminated string STR for the byte CH.
On exit %eax holds the address of the first matching byte (exit at L2)
or NULL when the terminating NUL is reached first (exit at L3). A CH
of NUL matches the terminator itself: in the byte-wise checks every
byte is compared against CH before it is compared against NUL.
Register use in the word loop:
%eax  string pointer (already advanced past the word being tested)
%ecx  word most recently loaded from the string
%edx  CH replicated into all four bytes
%ebx  word XOR %edx, i.e. bytes equal to CH are zero
%ebp, %edi, %esi  scratch values for the magic-number tests */
ENTRY (strchr)
pushl %edi /* Save callee-saved registers. */
pushl %esi
pushl %ebx
pushl %ebp
/* Four registers were pushed, so the arguments are now 16 bytes
further up the stack than at function entry. */
movl 20(%esp), %eax /* get string pointer */
movl 24(%esp), %edx /* get character we are looking for */
movl %eax, %edi /* duplicate string pointer for later */
xorl %ecx, %ecx /* clear %ecx */
/* At the moment %edx contains C. What we need for the
algorithm is C in all bytes of the dword. Avoid
operations on 16 bit words because these require a
prefix byte (and one more cycle). */
movb %dl, %dh /* now it is 0|0|c|c */
movb %dl, %cl /* we construct the lower half in %ecx */
shll $16, %edx /* now %edx is c|c|0|0 */
movb %cl, %ch /* now %ecx is 0|0|c|c */
orl %ecx, %edx /* and finally c|c|c|c */
/* Test up to three leading bytes one by one until %eax is aligned
on a four byte boundary. %edi keeps the original alignment
(1, 2 or 3) and decides how many single bytes must be tested. */
andl $3, %edi /* mask alignment bits */
jz L11 /* alignment is 0 => start loop */
movb (%eax), %cl /* load single byte */
cmpb %cl, %dl /* is byte == C? */
je L2 /* yes => return pointer */
cmp $0, %cl /* is byte NUL? */
je L3 /* yes => return NULL */
incl %eax /* increment pointer */
cmp $3, %edi /* was alignment == 3? */
je L11 /* yes => start loop */
movb (%eax), %cl /* load single byte */
cmpb %cl, %dl /* is byte == C? */
je L2 /* yes => return pointer */
cmp $0, %cl /* is byte NUL? */
je L3 /* yes => return NULL */
incl %eax /* increment pointer */
cmp $2, %edi /* was alignment == 2? */
je L11 /* yes => start loop */
movb (%eax), %cl /* load single byte */
cmpb %cl, %dl /* is byte == C? */
je L2 /* yes => return pointer */
cmp $0, %cl /* is byte NUL? */
je L3 /* yes => return NULL */
incl %eax /* increment pointer */
/* The following code is the preparation for the loop. The
four instructions up to `L1' will not be executed in the loop
because the same code is found at the end of the loop, but
there it is executed in parallel with other instructions. */
L11: movl (%eax), %ecx
movl $magic, %ebp
movl $magic, %edi
addl %ecx, %ebp
/* The main loop: it looks complex and indeed it is. I would
love to say `it was hard to write, so it should be hard to
read' but I will give some more hints. To fully understand
this code you should first take a look at the i486 version.
The basic algorithm is the same, but here the code is organized
in a way which permits to use both pipelines all the time.
I tried to make it a bit more understandable by indenting
the code according to stage in the algorithm. It goes as
follows:
check for 0 in 1st word
check for C in 1st word
check for 0 in 2nd word
check for C in 2nd word
check for 0 in 3rd word
check for C in 3rd word
check for 0 in 4th word
check for C in 4th word
Please note that doing the test for NUL before the test for
C allows us to overlap the test for 0 in the next word with
the test for C.
On entry to L1: %ecx = current word, %edi = magic and
%ebp = magic + word. */
L1: xorl %ecx, %ebp /* (word+magic)^word */
addl %ecx, %edi /* add magic word */
leal 4(%eax), %eax /* increment pointer (leal keeps the flags) */
jnc L4 /* no carry from previous addl => word with NUL */
movl %ecx, %ebx /* duplicate original word */
orl $magic, %ebp /* ((word+magic)^word)|magic */
addl $1, %ebp /* ((word+magic)^word)|magic == 0xffffffff? */
jne L4 /* no => we found word with NUL */
movl $magic, %esi /* load magic value */
xorl %edx, %ebx /* clear bytes which are C */
movl (%eax), %ecx /* already load the next word */
addl %ebx, %esi /* (word+magic), word being the XORed word */
movl $magic, %edi
jnc L5 /* no carry from previous addl => word with C */
movl %edi, %ebp
xorl %ebx, %esi /* (word+magic)^word */
addl %ecx, %ebp /* magic + next word, for the next NUL test */
orl $magic, %esi /* ((word+magic)^word)|magic */
addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/
jne L5 /* no => we found word with C */
/* 2nd word: same sequence as for the 1st word. */
xorl %ecx, %ebp
addl %ecx, %edi
leal 4(%eax), %eax
jnc L4
movl %ecx, %ebx
orl $magic, %ebp
addl $1, %ebp
jne L4
movl $magic, %esi
xorl %edx, %ebx
movl (%eax), %ecx
addl %ebx, %esi
movl $magic, %edi
jnc L5
movl %edi, %ebp
xorl %ebx, %esi
addl %ecx, %ebp
orl $magic, %esi
addl $1, %esi
jne L5
/* 3rd word: same sequence as for the 1st word. */
xorl %ecx, %ebp
addl %ecx, %edi
leal 4(%eax), %eax
jnc L4
movl %ecx, %ebx
orl $magic, %ebp
addl $1, %ebp
jne L4
movl $magic, %esi
xorl %edx, %ebx
movl (%eax), %ecx
addl %ebx, %esi
movl $magic, %edi
jnc L5
movl %edi, %ebp
xorl %ebx, %esi
addl %ecx, %ebp
orl $magic, %esi
addl $1, %esi
jne L5
/* 4th word: same sequence, but the last branch closes the loop. */
xorl %ecx, %ebp
addl %ecx, %edi
leal 4(%eax), %eax
jnc L4
movl %ecx, %ebx
orl $magic, %ebp
addl $1, %ebp
jne L4
movl $magic, %esi
xorl %edx, %ebx
movl (%eax), %ecx
addl %ebx, %esi
movl $magic, %edi
jnc L5
movl %edi, %ebp
xorl %ebx, %esi
addl %ecx, %ebp
orl $magic, %esi
addl $1, %esi
je L1 /* neither NUL nor C in four words => next round */
/* We know there is no NUL byte but a C byte in the word.
%ebx contains NUL in this particular byte. */
L5: subl $4, %eax /* adjust pointer */
testb %bl, %bl /* first byte == C? */
jz L2 /* yes => return pointer */
incl %eax /* increment pointer */
testb %bh, %bh /* second byte == C? */
jz L2 /* yes => return pointer */
shrl $16, %ebx /* make upper bytes accessible */
incl %eax /* increment pointer */
cmp $0, %bl /* third byte == C */
je L2 /* yes => return pointer */
incl %eax /* increment pointer: it must be the fourth byte */
L2: popl %ebp /* restore saved registers */
popl %ebx
popl %esi
popl %edi
ret
/* We know there is a NUL byte in the word. But we have to test
whether there is a C byte before it in the word. */
L4: subl $4, %eax /* adjust pointer */
cmpb %dl, %cl /* first byte == C? */
je L2 /* yes => return pointer */
cmpb $0, %cl /* first byte == NUL? */
je L3 /* yes => return NULL */
incl %eax /* increment pointer */
cmpb %dl, %ch /* second byte == C? */
je L2 /* yes => return pointer */
cmpb $0, %ch /* second byte == NUL? */
je L3 /* yes => return NULL */
shrl $16, %ecx /* make upper bytes accessible */
incl %eax /* increment pointer */
cmpb %dl, %cl /* third byte == C? */
je L2 /* yes => return pointer */
cmpb $0, %cl /* third byte == NUL? */
je L3 /* yes => return NULL */
incl %eax /* increment pointer */
/* The test for the fourth byte is necessary! The fourth byte is the
NUL here, and when C itself is NUL it is also the match. */
cmpb %dl, %ch /* fourth byte == C? */
je L2 /* yes => return pointer */
L3: xorl %eax, %eax /* set return value = NULL */
popl %ebp /* restore saved registers */
popl %ebx
popl %esi
popl %edi
ret
#undef index
weak_alias (strchr, index)

185
sysdeps/i386/i586/strlen.S Normal file
View File

@ -0,0 +1,185 @@
/* strlen -- Compute length of NUL terminated string.
Highly optimized version for ix86, x>=5.
Copyright (C) 1995 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
/* This version is especially optimized for the i586 (and following?)
processors. This is mainly done by using the two pipelines. The
version optimized for i486 is weak in this aspect because to get
as much parallelism as possible we have to execute some *more* instructions.
The code below is structured to reflect the pairing of the instructions
as *I think* it is. I have no processor data book to verify this.
If you find something you think is incorrect let me know. */
/* The magic value which is used throughout in the whole code. */
#define magic 0xfefefeff
/*
INPUT PARAMETERS:
str (sp + 4)
*/
.text
/* strlen (str): return in %eax the number of bytes preceding the first
   NUL byte of STR.

   Register use:
     %eax  running string pointer
     %ecx  alignment bits in the prologue; in the loop the current word
	   plus the magic value
     %edx  zero at the start of every loop step, scratch afterwards

   No callee-saved register is touched, so nothing is pushed.  */
ENTRY(strlen)
	movl 4(%esp), %eax		/* get string pointer */
	movl %eax, %ecx			/* duplicate it */

	andl $3, %ecx			/* mask alignment bits */
	jz L11				/* aligned => start loop */

	/* Test up to three leading bytes one by one until the pointer is
	   aligned.  %ecx is 1, 2 or 3 here, so %ch is zero and can serve
	   as the NUL to compare with.  */
	cmpb %ch, (%eax)		/* is byte NUL? */
	je L2				/* yes => return */

	incl %eax			/* increment pointer */
	cmpl $3, %ecx			/* was alignment = 3? */
	je L11				/* yes => now it is aligned and start loop */

	cmpb %ch, (%eax)		/* is byte NUL? */
	je L2				/* yes => return */

	incl %eax			/* increment pointer */
	cmpl $2, %ecx			/* was alignment = 2? */
	je L11				/* yes => now it is aligned and start loop */

	cmpb %ch, (%eax)		/* is byte NUL? */
	je L2				/* yes => return */

	incl %eax			/* increment pointer */

      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
	 change any of the hole bits of LONGWORD.

	 1) Is this safe?  Will it catch all the zero bytes?
	 Suppose there is a byte with all zeros.  Any carry bits
	 propagating from its left will fall into the hole at its
	 least significant bit and stop.  Since there will be no
	 carry from its most significant bit, the LSB of the
	 byte to the left will be unchanged, and the zero will be
	 detected.

	 2) Is this worthwhile?  Will it ignore everything except
	 zero bytes?  Suppose every byte of LONGWORD has a bit set
	 somewhere.  There will be a carry into bit 8.	If bit 8
	 is set, this will carry into bit 16.  If bit 8 is clear,
	 one of bits 9-15 must be set, so there will be a carry
	 into bit 16.  Similarly, there will be a carry into bit
	 24.  If one of bits 24-31 is set, there will be a carry
	 into bit 32 (=carry flag), so all of the hole bits will
	 be changed.  */

L11:	xorl %edx, %edx			/* We need %edx == 0 for later */

	/* Main loop, unrolled four times.  Every step leaves %ecx holding
	   word+magic; the original word is reconstructed only once, at
	   L3, after a NUL has been detected.  */
L1:
	movl (%eax), %ecx		/* get word (= 4 bytes) in question */
	addl $4, %eax			/* adjust pointer for *next* word */

	subl %ecx, %edx			/* first step to negate word */
	addl $magic, %ecx		/* add magic word */

	decl %edx			/* complete negation of word (decl
					   leaves the carry flag alone) */
	jnc L3				/* no carry from addl => NUL found */

	xorl %ecx, %edx			/* ~((word+magic)^word) */
	andl $~magic, %edx		/* any hole bit left unchanged? */

	jne L3				/* yes => determine byte */


	movl (%eax), %ecx		/* get word (= 4 bytes) in question */
	addl $4, %eax			/* adjust pointer for *next* word */

	subl %ecx, %edx			/* first step to negate word */
	addl $magic, %ecx		/* add magic word */

	decl %edx			/* complete negation of word */
	jnc L3				/* no carry from addl => NUL found */

	xorl %ecx, %edx			/* ~((word+magic)^word) */
	andl $~magic, %edx		/* any hole bit left unchanged? */

	jne L3				/* yes => determine byte */


	movl (%eax), %ecx		/* get word (= 4 bytes) in question */
	addl $4, %eax			/* adjust pointer for *next* word */

	subl %ecx, %edx			/* first step to negate word */
	addl $magic, %ecx		/* add magic word */

	decl %edx			/* complete negation of word */
	jnc L3				/* no carry from addl => NUL found */

	xorl %ecx, %edx			/* ~((word+magic)^word) */
	andl $~magic, %edx		/* any hole bit left unchanged? */

	jne L3				/* yes => determine byte */


	movl (%eax), %ecx		/* get word (= 4 bytes) in question */
	addl $4, %eax			/* adjust pointer for *next* word */

	subl %ecx, %edx			/* first step to negate word */
	addl $magic, %ecx		/* add magic word */

	decl %edx			/* complete negation of word */
	jnc L3				/* no carry from addl => NUL found */

	xorl %ecx, %edx			/* ~((word+magic)^word) */
	andl $~magic, %edx		/* any hole bit left unchanged? */

	je L1				/* no => start loop again */


	/* A NUL byte is somewhere in the word.  Every path that gets here
	   arrives with %ecx == word+magic, so the addition must be undone
	   before the bytes are inspected.  Doing this here, rather than
	   inside the loop, also covers the `jnc L3' exits; otherwise those
	   would examine the bytes of word+magic instead of the word and
	   could report a wrong length (e.g. for the word 0x00000001).  */
L3:	subl $4, %eax			/* correct too early pointer increment */
	subl $magic, %ecx		/* undo the addl to restore the word */

	testb %cl, %cl			/* lowest byte NUL? */
	jz L2				/* yes => return */

	incl %eax			/* increment pointer */
	testb %ch, %ch			/* second byte NUL? */
	jz L2				/* yes => return */

	shrl $16, %ecx			/* make upper bytes accessible */
	incl %eax			/* increment pointer */

	cmpb $0, %cl			/* is third byte NUL? */
	jz L2				/* yes => return */

	incl %eax			/* increment pointer: it must be the
					   fourth byte */

L2:	subl 4(%esp), %eax		/* now compute the length as difference
					   between start and terminating NUL
					   character */

	ret

136
sysdeps/i386/i586/sub_n.S Normal file
View File

@ -0,0 +1,136 @@
/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
and store difference in a third limb vector.
Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/*
INPUT PARAMETERS
res_ptr (sp + 4)
s1_ptr (sp + 8)
s2_ptr (sp + 12)
size (sp + 16)
*/
/* Register aliases used throughout the routine.
r1, r2      the two result limbs currently being computed
src1, src2  running pointers into the minuend and the subtrahend
dst         running pointer into the result vector
x           the limb of src2 that is subtracted next (loaded ahead) */
#define r1 %eax
#define r2 %edx
#define src1 %esi
#define src2 %ebp
#define dst %edi
#define x %ebx
#include "sysdep.h"
#include "asm-syntax.h"
.text
ALIGN (3)
.globl C_SYMBOL_NAME(__mpn_sub_n)
/* __mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size): store s1 - s2 limb by limb
at res_ptr and return the final borrow (0 or 1) in %eax.
The borrow lives in the carry flag across the whole routine, so between
two sbbl instructions only instructions that leave the carry flag alone
(movl, leal, decl, incl, pushl, popl) are used. */
C_SYMBOL_NAME(__mpn_sub_n:)
pushl %edi
pushl %esi
pushl %ebx
pushl %ebp
/* Four registers were pushed: the arguments are 16 bytes further up. */
movl 20(%esp),dst /* res_ptr */
movl 24(%esp),src1 /* s1_ptr */
movl 28(%esp),src2 /* s2_ptr */
movl 32(%esp),%ecx /* size */
movl (src2),x /* preload the first limb of s2 */
/* The last limb is always handled at Lend2. The other size-1 limbs
are split into (size-1)/8 rounds of the unrolled loop (%ecx) and
(size-1)%8 single limbs for Loop2 (r2). */
decl %ecx
movl %ecx,r2
shrl $3,%ecx
andl $7,r2
testl %ecx,%ecx /* zero carry flag */
jz Lend
pushl r2 /* r2 is needed as scratch inside the loop */
ALIGN (3)
/* Unrolled loop: eight limbs per round, handled as four pairs. dst is
advanced first, hence the negative store offsets. */
Loop: movl 28(dst),%eax /* fetch destination cache line */
leal 32(dst),dst
L1: movl (src1),r1
movl 4(src1),r2
sbbl x,r1
movl 4(src2),x
sbbl x,r2
movl 8(src2),x
movl r1,-32(dst)
movl r2,-28(dst)
L2: movl 8(src1),r1
movl 12(src1),r2
sbbl x,r1
movl 12(src2),x
sbbl x,r2
movl 16(src2),x
movl r1,-24(dst)
movl r2,-20(dst)
L3: movl 16(src1),r1
movl 20(src1),r2
sbbl x,r1
movl 20(src2),x
sbbl x,r2
movl 24(src2),x
movl r1,-16(dst)
movl r2,-12(dst)
L4: movl 24(src1),r1
movl 28(src1),r2
sbbl x,r1
movl 28(src2),x
sbbl x,r2
movl 32(src2),x /* first limb of s2 for whatever comes next */
movl r1,-8(dst)
movl r2,-4(dst)
leal 32(src1),src1
leal 32(src2),src2
decl %ecx
jnz Loop
popl r2 /* get back the count of remaining single limbs */
Lend:
decl r2 /* test r2 w/o clobbering carry */
js Lend2 /* r2 was 0 => only the last limb is left */
incl r2
/* One limb per round for the (size-1)%8 leftover limbs. */
Loop2:
leal 4(dst),dst
movl (src1),r1
sbbl x,r1
movl 4(src2),x
movl r1,-4(dst)
leal 4(src1),src1
leal 4(src2),src2
decl r2
jnz Loop2
/* The last limb; x has already been loaded. */
Lend2:
movl (src1),r1
sbbl x,r1
movl r1,(dst)
/* Turn the carry flag into the return value: sbbl gives 0 or -1,
negl makes that 0 or 1. */
sbbl %eax,%eax
negl %eax
popl %ebp
popl %ebx
popl %esi
popl %edi
ret

View File

@ -0,0 +1,82 @@
/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
the result from a second limb vector.
Copyright (C) 1992, 1994 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/*
INPUT PARAMETERS
res_ptr (sp + 4)
s1_ptr (sp + 8)
size (sp + 12)
s2_limb (sp + 16)
*/
#include "sysdep.h"
#include "asm-syntax.h"
/* Register names for the arguments; they are wrapped in R() where used. */
#define res_ptr edi
#define s1_ptr esi
#define size ecx
#define s2_limb ebp
TEXT
ALIGN (3)
GLOBL C_SYMBOL_NAME(__mpn_submul_1)
/* __mpn_submul_1 (res_ptr, s1_ptr, size, s2_limb): multiply the limb
vector at s1_ptr by s2_limb, subtract the product from the vector at
res_ptr and return the most significant carry limb in eax.
NOTE(review): the INSN1/INSN2/R/MEM_* macros come from asm-syntax.h,
which is not visible here; judging from the argument loads below,
INSN2 takes the destination operand first -- confirm against the
header. */
C_SYMBOL_NAME(__mpn_submul_1:)
INSN1(push,l ,R(edi))
INSN1(push,l ,R(esi))
INSN1(push,l ,R(ebx))
INSN1(push,l ,R(ebp))
/* Four registers were pushed: the arguments are 16 bytes further up. */
INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20))
INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24))
INSN2(mov,l ,R(size),MEM_DISP(esp,28))
INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32))
/* Point both vectors past their end and negate the size, so a single
index can count upwards and the loop ends when it reaches zero. */
INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4))
INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
INSN1(neg,l ,R(size))
INSN2(xor,l ,R(edx),R(edx)) /* no carry limb before the first round */
ALIGN (3)
Loop:
INSN2(mov,l ,R(ebx),R(edx)) /* carry limb of the previous round */
INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) /* next limb of s1 */
INSN1(mul,l ,R(s2_limb)) /* edx:eax = eax * s2_limb */
INSN2(add,l ,R(eax),R(ebx)) /* add old carry limb to the low product */
INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) /* mov keeps the flags */
INSN2(adc,l ,R(edx),$0) /* carry of the add goes to the high limb */
INSN2(sub,l ,R(ebx),R(eax)) /* res limb - low product */
INSN2(adc,l ,R(edx),$0) /* borrow of the sub goes to the high limb */
INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) /* store the result limb */
INSN1(inc,l ,R(size))
INSN1(jnz, ,Loop)
INSN2(mov,l ,R(eax),R(edx)) /* return the final carry limb */
INSN1(pop,l ,R(ebp))
INSN1(pop,l ,R(ebx))
INSN1(pop,l ,R(esi))
INSN1(pop,l ,R(edi))
ret

315
sysdeps/i386/memchr.S Normal file
View File

@ -0,0 +1,315 @@
/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less
than N.
For Intel 80x86, x>=3.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
This file is part of the GNU C Library.
This version is developed using the same algorithm as the fast C
version which carries the following introduction:
Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
with help from Dan Sahlin (dan@sics.se) and
commentary by Jim Blandy (jimb@ai.mit.edu);
adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
and implemented by Roland McGrath (roland@ai.mit.edu).
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
str (sp + 4)
c (sp + 8)
len (sp + 12)
*/
.text
/* memchr (str, c, len): search the first LEN bytes at STR for the byte C.
   On exit %eax holds the address of the first matching byte, or NULL if
   none of the LEN bytes matches.

   Register use:
     %eax  running pointer into the memory block
     %edx  C, later replicated into all four bytes (c|c|c|c)
     %esi  remaining length (biased by -16 after the main loop)
     %ecx  word XOR c|c|c|c, i.e. bytes equal to C are zero
     %edi  scratch for the magic-number test  */
ENTRY (memchr)
	/* Save callee-saved registers used in this function.  */
	pushl %esi
	pushl %edi

	/* Load parameters into registers.  Two registers were pushed, so
	   the arguments are 8 bytes further up the stack.  */
	movl 12(%esp), %eax	/* str: pointer to memory block.  */
	movl 16(%esp), %edx	/* c: byte we are looking for.  */
	movl 20(%esp), %esi	/* len: length of memory block.  */

	/* If we must not test more than three characters, test
	   them one by one.  This is especially true for 0.  */
	cmpl $4, %esi
	jb L3

	/* At the moment %edx contains C.  What we need for the
	   algorithm is C in all bytes of the dword.  Avoid
	   operations on 16 bit words because these require a
	   prefix byte (and one more cycle).  */
	movb %dl, %dh		/* Now it is 0|0|c|c */
	movl %edx, %ecx
	shll $16, %edx		/* Now c|c|0|0 */
	movw %cx, %dx		/* And finally c|c|c|c */

	/* Better performance can be achieved if the word (32
	   bit) memory access is aligned on a four-byte-boundary.
	   So process first bytes one by one until boundary is
	   reached.  Don't use a loop for better performance.

	   The alignment test only needs the low two bits of the
	   pointer, so it is done on %al: a byte-sized test must
	   name a byte register.  */

	testb $3, %al		/* correctly aligned ? */
	je L2			/* yes => begin loop */
	cmpb %dl, (%eax)	/* compare byte */
	je L9			/* target found => return */
	incl %eax		/* increment source pointer */
	decl %esi		/* decrement length counter */
	je L4			/* len==0 => return NULL */

	testb $3, %al		/* correctly aligned ? */
	je L2			/* yes => begin loop */
	cmpb %dl, (%eax)	/* compare byte */
	je L9			/* target found => return */
	incl %eax		/* increment source pointer */
	decl %esi		/* decrement length counter */
	je L4			/* len==0 => return NULL */

	testb $3, %al		/* correctly aligned ? */
	je L2			/* yes => begin loop */
	cmpb %dl, (%eax)	/* compare byte */
	je L9			/* target found => return */
	incl %eax		/* increment source pointer */
	decl %esi		/* decrement length counter */
	/* no test for len==0 here, because this is done in the
	   loop head */
	jmp L2

      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
	 change any of the hole bits of LONGWORD.

	 1) Is this safe?  Will it catch all the zero bytes?
	 Suppose there is a byte with all zeros.  Any carry bits
	 propagating from its left will fall into the hole at its
	 least significant bit and stop.  Since there will be no
	 carry from its most significant bit, the LSB of the
	 byte to the left will be unchanged, and the zero will be
	 detected.

	 2) Is this worthwhile?  Will it ignore everything except
	 zero bytes?  Suppose every byte of LONGWORD has a bit set
	 somewhere.  There will be a carry into bit 8.	If bit 8
	 is set, this will carry into bit 16.  If bit 8 is clear,
	 one of bits 9-15 must be set, so there will be a carry
	 into bit 16.  Similarly, there will be a carry into bit
	 24.  If one of bits 24-31 is set, there will be a carry
	 into bit 32 (=carry flag), so all of the hole bits will
	 be changed.

	 3) But wait!  Aren't we looking for C, not zero?
	 Good point.  So what we do is XOR LONGWORD with a longword,
	 each of whose bytes is C.  This turns each byte that is C
	 into a zero.  */


	/* Each round the main loop processes 16 bytes.  */

	ALIGN (4)

L1:	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	addl %ecx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */

	/* According to the algorithm we had to reverse the effect of the
	   XOR first and then test the overflow bits.  But because the
	   following XOR would destroy the carry flag and it would (in a
	   representation with more than 32 bits) not alter the last
	   overflow, we can now test this condition.  If no carry is signaled
	   no overflow must have occurred in the last byte => it was 0.  */
	jnc L8

	/* We are only interested in carry bits that change due to the
	   previous add, so remove original bits */
	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */

	/* Now test for the other three overflow bits.  */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */

	/* If at least one byte of the word is C we don't get 0 in %edi.  */
	jnz L8			/* found it => return pointer */

	/* This process is unfolded four times for better performance.
	   We don't increment the source pointer each time.  Instead we
	   use offsets and increment by 16 in each run of the loop.  But
	   before probing for the matching byte we need some extra code
	   (the pointer adjustments at L5, L6 and L7 below).  Even the len
	   can be compared with constants instead of decrementing each
	   time.  */

	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	addl %ecx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L7			/* highest byte is C => return pointer */
	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L7			/* found it => return pointer */

	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	addl %ecx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L6			/* highest byte is C => return pointer */
	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L6			/* found it => return pointer */

	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	addl %ecx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L5			/* highest byte is C => return pointer */
	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L5			/* found it => return pointer */

	/* Adjust both counters for a full round, i.e. 16 bytes.  */
	addl $16, %eax
L2:	subl $16, %esi
	jae L1			/* Still more than 16 bytes remaining */

	/* Process remaining bytes separately.  %esi now holds the
	   remaining length minus 16, hence the biased constants.  */
	cmpl $4-16, %esi	/* rest < 4 bytes? */
	jb L3			/* yes, then test byte by byte */

	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	addl %ecx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L8			/* highest byte is C => return pointer */
	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jne L8			/* found it => return pointer */
	addl $4, %eax		/* adjust source pointer */

	cmpl $8-16, %esi	/* rest < 8 bytes? */
	jb L3			/* yes, then test byte by byte */

	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	addl %ecx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L8			/* highest byte is C => return pointer */
	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jne L8			/* found it => return pointer */
	addl $4, %eax		/* adjust source pointer */

	cmpl $12-16, %esi	/* rest < 12 bytes? */
	jb L3			/* yes, then test byte by byte */

	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	addl %ecx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L8			/* highest byte is C => return pointer */
	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jne L8			/* found it => return pointer */
	addl $4, %eax		/* adjust source pointer */

	/* Check the remaining bytes one by one.  */
L3:	andl $3, %esi		/* mask out uninteresting bytes */
	jz L4			/* no remaining bytes => return NULL */

	cmpb %dl, (%eax)	/* compare byte with C */
	je L9			/* equal, then return pointer */
	incl %eax		/* increment source pointer */
	decl %esi		/* decrement length */
	jz L4			/* no remaining bytes => return NULL */

	cmpb %dl, (%eax)	/* compare byte with C */
	je L9			/* equal, then return pointer */
	incl %eax		/* increment source pointer */
	decl %esi		/* decrement length */
	jz L4			/* no remaining bytes => return NULL */

	cmpb %dl, (%eax)	/* compare byte with C */
	je L9			/* equal, then return pointer */

L4:	/* no byte found => return NULL */
	xorl %eax, %eax
	jmp L9

	/* add missing source pointer increments (the labels fall through,
	   so L5 adds 12, L6 adds 8 and L7 adds 4) */
L5:	addl $4, %eax
L6:	addl $4, %eax
L7:	addl $4, %eax

	/* Test for the matching byte in the word.  %ecx contains a NUL
	   char in the byte which originally was the byte we are looking
	   at.  */
L8:	testb %cl, %cl		/* test first byte in dword */
	jz L9			/* if zero => return pointer */
	incl %eax		/* increment source pointer */

	testb %ch, %ch		/* test second byte in dword */
	jz L9			/* if zero => return pointer */
	incl %eax		/* increment source pointer */

	testl $0xff0000, %ecx	/* test third byte in dword */
	jz L9			/* if zero => return pointer */
	incl %eax		/* increment source pointer */

	/* No further test needed since we know it is one of the four bytes.  */

L9:	popl %edi		/* pop saved registers */
	popl %esi

	ret

View File

@ -1,48 +0,0 @@
/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less
than N.
For Intel 80x86, x>=3.
Copyright (C) 1991, 1992, 1993 Free Software Foundation, Inc.
Contributed by Torbjorn Granlund (tege@sics.se).
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 675 Mass Ave,
Cambridge, MA 02139, USA. */
#include <ansidecl.h>
#include <string.h>
#ifdef __GNUC__
/* Return a pointer to the first byte equal to C within the LEN bytes
starting at STR, or a null pointer if there is none. Implemented with
the x86 string instruction `repnz scasb'. */
PTR
DEFUN(memchr, (str, c, len),
CONST PTR str AND int c AND size_t len)
{
PTR retval;
/* Operands: %0 is eax (C on input, the result on output), %1 is edi
(the running pointer), %2 is ecx (the remaining count). */
asm("cld\n" /* Search forward. */
/* NOTE(review): testl only clears Z if STR is non-null; a null STR
together with LEN == 0 would leave Z set -- confirm that callers
never do this. */
"testl %1,%1\n" /* Clear Z flag, to handle LEN == 0. */
/* Some old versions of gas need `repne' instead of `repnz'. */
"repnz\n" /* Search for C in al. */
"scasb\n"
/* When nothing matched the scan ran until the count in %2 reached
zero, so copying %2 yields the null result. On a match the value
copied here is overwritten by the leal below. */
"movl %2,%0\n" /* Set %0 to 0 (without affecting Z flag). */
"jnz done\n" /* Jump if we found nothing equal to C. */
"leal -1(%1),%0\n" /* edi has been incremented. Return edi-1. */
/* NOTE(review): `done' is an ordinary (non-local) assembler label; it
would be defined twice if this asm were ever emitted more than once
in one object file. */
"done:" :
"=a" (retval), "=D" (str), "=c" (len) :
"0" (c), "1" (str), "2" (len));
return retval;
}
#else
/* Without GNU C inline assembly fall back to the generic C version. */
#include <sysdeps/generic/memchr.c>
#endif

68
sysdeps/i386/memcmp.S Normal file
View File

@ -0,0 +1,68 @@
/* memcmp -- compare two memory blocks for differences in the first COUNT
bytes.
Copyright (C) 1995 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
block1 (sp + 4)
block2 (sp + 8)
len (sp + 12)
*/
.text
/* memcmp (block1, block2, len): compare the first LEN bytes of the two
blocks. %eax is 0 if they are equal, 1 if the first differing byte is
bigger in block #1 and -1 if it is bigger in block #2. */
ENTRY (memcmp)
pushl %esi /* Save callee-saved registers. */
movl %edi, %edx /* Note that %edx is not used and can
so be used to save %edi. It's faster. */
/* Only %esi was pushed, so the arguments are 4 bytes further up. */
movl 12(%esp), %esi /* Load address of block #1. */
movl 16(%esp), %edi /* Load address of block #2. */
movl 20(%esp), %ecx /* Load maximal length of compare area. */
cld /* Set direction of comparison. */
xorl %eax, %eax /* Default result. This also sets the zero
flag, so a length of 0 compares equal. */
repe /* Compare at most %ecx bytes. */
cmpsb
jz L1 /* If even last byte was equal we return 0. */
/* The memory blocks are not equal. So result of the last
subtraction is present in the carry flag. It is set when
the byte in block #2 is bigger. In this case we have to
return -1 (=0xffffffff), else 1. */
sbbl %eax, %eax /* This is tricky. %eax == 0 and carry is set
or not depending on last subtraction. */
/* At this point %eax == 0, if the byte of block #1 was bigger, and
0xffffffff if the last byte of block #2 was bigger. The latter
case is already correct but the former needs a little adjustment.
Note that the following operation does not change 0xffffffff. */
orb $1, %al /* Change 0 to 1. */
L1: popl %esi /* Restore registers. */
movl %edx, %edi
ret
#undef bcmp
weak_alias (memcmp, bcmp)

87
sysdeps/i386/stpcpy.S Normal file
View File

@ -0,0 +1,87 @@
/* stpcpy -- copy SRC to DEST returning the address of the terminating '\0'
in DEST.
For Intel 80x86, x>=3.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu).
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* This function is defined neither in ANSI nor POSIX standards but is
also not invented here. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
dest (sp + 4)
src (sp + 8)
*/
.text
/* __stpcpy (dest, src)
Copy the NUL-terminated string SRC to DEST, including the NUL, and
return in %eax the address of the NUL byte stored in DEST.
Nothing is pushed, so the arguments stay at 4(%esp) and 8(%esp).
Registers written: %eax, %ecx, %edx (only %dl). */
ENTRY (__stpcpy)
movl 4(%esp), %eax /* load destination pointer */
movl 8(%esp), %ecx /* load source pointer */
subl %eax, %ecx /* magic: reduce number of loop variants
to one using addressing mode. %ecx now holds
src - dest, so (%eax,%ecx) is the source byte
that belongs to destination byte (%eax). */
/* Here we would like to write
subl $4, %eax
ALIGN (4)
but the assembler is too smart and optimizes for the shortest
form where the number only needs one byte. But if we could
have the long form we would not need the alignment. */
.byte 0x81, 0xe8 /* This is `subl $0x00000004, %eax' */
.long 0x00000004
/* The subtraction above is undone by the `addl $4, %eax' at the top
of the first loop round.
Four times unfolded loop with only one loop counter. This
is achieved by the use of index+base addressing mode. As the
loop counter we use the destination address because this is
also the result. */
L1: addl $4, %eax /* increment loop counter */
movb (%eax,%ecx), %dl /* load current char */
movb %dl, (%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L2 /* yes, then exit; %eax already points to it */
movb 1(%eax,%ecx), %dl /* load current char */
movb %dl, 1(%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L3 /* yes, then exit; NUL is at %eax + 1 */
movb 2(%eax,%ecx), %dl /* load current char */
movb %dl, 2(%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L4 /* yes, then exit; NUL is at %eax + 2 */
movb 3(%eax,%ecx), %dl /* load current char */
movb %dl, 3(%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jnz L1 /* no, then continue loop */
/* The exits fall through a chain of increments so that %eax ends up
pointing at the NUL byte just written (+3, +2, +1 or +0). */
incl %eax /* correct loop counter */
L4: incl %eax
L3: incl %eax
L2:
ret
weak_alias (__stpcpy, stpcpy)

143
sysdeps/i386/stpncpy.S Normal file
View File

@ -0,0 +1,143 @@
/* stpncpy -- copy no more than N bytes from SRC to DEST, returning the
address of the terminating '\0' in DEST, or DEST + N if no '\0' was copied.
For Intel 80x86, x>=3.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
Some bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
- original wrote n+1 chars in some cases.
- stpncpy() ought to behave like strncpy() ie. not null-terminate
if limited by n. glibc-1.09 stpncpy() does this.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
dest (sp + 4)
src (sp + 8)
maxlen (sp + 12)
*/
.text
/* __stpncpy (dest, src, maxlen)
Copy at most MAXLEN bytes from SRC to DEST. If a NUL byte is copied
before MAXLEN bytes were written, the rest of the MAXLEN bytes in
DEST is filled with NUL and the address of the first NUL in DEST is
returned in %eax. Otherwise nothing is NUL-terminated and
DEST + MAXLEN is returned.
%esi is pushed, so the arguments are found at 8, 12 and 16(%esp).
Registers written: %eax, %ecx, %edx (only %dl); %esi is restored. */
ENTRY (__stpncpy)
pushl %esi
movl 8(%esp), %eax /* load destination pointer */
movl 12(%esp), %esi /* load source pointer */
movl 16(%esp), %ecx /* load maximal length */
subl %eax, %esi /* magic: reduce number of loop variants
to one using addressing mode. %esi now holds
src - dest, so (%eax,%esi) is the source byte
that belongs to destination byte (%eax). */
jmp L1 /* jump to loop "head" */
ALIGN(4)
/* Four times unfolded loop with two loop counters. We get
the third value (the source address) by using the index+base
addressing mode. */
L2: movb (%eax,%esi), %dl /* load current char */
movb %dl, (%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L7 /* yes, then exit */
movb 1(%eax,%esi), %dl /* load current char */
movb %dl, 1(%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L6 /* yes, then exit */
movb 2(%eax,%esi), %dl /* load current char */
movb %dl, 2(%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L5 /* yes, then exit */
movb 3(%eax,%esi), %dl /* load current char */
movb %dl, 3(%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L4 /* yes, then exit */
addl $4, %eax /* increment loop counter for full round */
L1: subl $4, %ecx /* at least 4 more bytes allowed? */
jae L2 /* yes (no borrow), then go to start of loop */
/* The at most 3 remaining bytes are not processed in a loop. */
addl $4, %ecx /* correct above subtraction */
jz L9 /* maximal allowed char reached => go to end */
movb (%eax,%esi), %dl /* load current char */
movb %dl, (%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L3 /* yes, then exit */
incl %eax /* increment pointer */
decl %ecx /* decrement length counter */
jz L9 /* no more allowed => exit */
movb (%eax,%esi), %dl /* load current char */
movb %dl, (%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L3 /* yes, then exit */
incl %eax /* increment pointer */
decl %ecx /* decrement length counter */
jz L9 /* no more allowed => exit */
movb (%eax,%esi), %dl /* load current char */
movb %dl, (%eax) /* and store it */
testb %dl, %dl /* was it NUL? */
jz L3 /* yes, then exit */
incl %eax /* increment pointer */
jmp L9 /* we don't have to test for counter underflow
because we know we had at most 3 bytes
remaining => exit */
/* When coming from the main loop we have to adjust the pointer so
that it addresses the NUL just written, and account in the counter
for the non-NUL bytes stored before it in this round. */
L4: decl %ecx /* decrement counter */
incl %eax /* increment pointer */
L5: decl %ecx /* decrement counter */
incl %eax /* increment pointer */
L6: decl %ecx /* decrement counter */
incl %eax /* increment pointer */
L7:
addl $3, %ecx /* correct pre-decrementation of counter
at the beginning of the loop; but why 3
and not 4? Very simple, we have to count
the NUL char we already wrote. */
jz L9 /* counter is also 0 => exit */
/* We now have to fill the rest of the buffer with NUL. This
is done in a tricky way. Please note that the addressing mode
used below is not the same we used above. Here we use the
%ecx register as index: %eax stays on the first NUL (it is the
return value) and the bytes %eax + %ecx down to %eax + 1 are
cleared. */
L8:
movb $0, (%ecx,%eax) /* store NUL char */
L3: decl %ecx /* all bytes written? (The short-tail code enters
here with the NUL already stored at %eax.) */
jnz L8 /* no, then again */
L9: popl %esi /* restore saved register content */
ret
weak_alias (__stpncpy, stpncpy)

278
sysdeps/i386/strchr.S Normal file
View File

@ -0,0 +1,278 @@
/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
For Intel 80x86, x>=3.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
str (sp + 4)
ch (sp + 8)
*/
.text
/* strchr (str, ch)
Return in %eax a pointer to the first byte in the NUL-terminated
string STR that equals the low byte of CH, or NULL (0) if the string
ends before such a byte is found. Searching for NUL itself returns
a pointer to the terminator.
%edi is pushed, so the arguments are found at 8 and 12(%esp).
Registers written: %eax, %ecx, %edx; %edi is used as scratch and
restored on every exit path. */
ENTRY (strchr)
pushl %edi /* Save callee-saved register used here. */
movl 8(%esp), %eax /* get string pointer */
movl 12(%esp), %edx /* get character we are looking for */
/* At the moment %edx contains C. What we need for the
algorithm is C in all bytes of the dword. Avoid
operations on 16 bit words because these require a
prefix byte (and one more cycle). */
movb %dl, %dh /* now it is 0|0|c|c */
movl %edx, %ecx
shll $16, %edx /* now it is c|c|0|0 */
movw %cx, %dx /* and finally c|c|c|c */
/* Before we start with the main loop we process single bytes
until the source pointer is aligned. This has two reasons:
1. aligned 32-bit memory access is faster
and (more important)
2. we process in the main loop 32 bit in one step although
we don't know the end of the string. But accessing at
4-byte alignment guarantees that we never access illegal
memory if this would not also be done by the trivial
implementation (this is because all processor inherent
boundaries are multiples of 4). */
/* The alignment tests only need the two low bits of the pointer, so
they are done on %al. A byte-sized test must name a byte register;
`testb' with %eax is at best accepted with a warning. */
testb $3, %al /* correctly aligned ? */
jz L11 /* yes => begin loop */
movb (%eax), %cl /* load byte in question (we need it twice) */
cmpb %cl, %dl /* compare byte */
je L6 /* target found => return */
testb %cl, %cl /* is NUL? */
jz L2 /* yes => return NULL */
incl %eax /* increment pointer */
testb $3, %al /* correctly aligned ? */
jz L11 /* yes => begin loop */
movb (%eax), %cl /* load byte in question (we need it twice) */
cmpb %cl, %dl /* compare byte */
je L6 /* target found => return */
testb %cl, %cl /* is NUL? */
jz L2 /* yes => return NULL */
incl %eax /* increment pointer */
testb $3, %al /* correctly aligned ? */
jz L11 /* yes => begin loop */
movb (%eax), %cl /* load byte in question (we need it twice) */
cmpb %cl, %dl /* compare byte */
je L6 /* target found => return */
testb %cl, %cl /* is NUL? */
jz L2 /* yes => return NULL */
incl %eax /* increment pointer */
/* Now we have reached alignment. */
jmp L11 /* begin loop */
/* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
change any of the hole bits of LONGWORD.
1) Is this safe? Will it catch all the zero bytes?
Suppose there is a byte with all zeros. Any carry bits
propagating from its left will fall into the hole at its
least significant bit and stop. Since there will be no
carry from its most significant bit, the LSB of the
byte to the left will be unchanged, and the zero will be
detected.
2) Is this worthwhile? Will it ignore everything except
zero bytes? Suppose every byte of LONGWORD has a bit set
somewhere. There will be a carry into bit 8. If bit 8
is set, this will carry into bit 16. If bit 8 is clear,
one of bits 9-15 must be set, so there will be a carry
into bit 16. Similarly, there will be a carry into bit
24. If one of bits 24-31 is set, there will be a carry
into bit 32 (=carry flag), so all of the hole bits will
be changed.
3) But wait! Aren't we looking for C, not zero?
Good point. So what we do is XOR LONGWORD with a longword,
each of whose bytes is C. This turns each byte that is C
into a zero. */
/* Each round the main loop processes 16 bytes. Every dword is
tested twice: first (XORed with c|c|c|c) for a byte equal to C,
then (restored) for a NUL byte. */
ALIGN(4)
L1: addl $16, %eax /* adjust pointer for whole round */
L11: movl (%eax), %ecx /* get word (= 4 bytes) in question */
xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
are now 0 */
movl $0xfefefeff, %edi /* magic value */
addl %ecx, %edi /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* C */
/* According to the algorithm we had to reverse the effect of the
XOR first and then test the overflow bits. But because the
following XOR would destroy the carry flag and it would (in a
representation with more than 32 bits) not alter the last
overflow, we can now test this condition. If no carry is signaled
no overflow must have occurred in the last byte => it was 0. */
jnc L7
/* We are only interested in carry bits that change due to the
previous add, so remove original bits */
xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
/* Now test for the other three overflow bits. */
orl $0xfefefeff, %edi /* set all non-carry bits */
incl %edi /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
/* If at least one byte of the word is C we don't get 0 in %edi. */
jnz L7 /* found it => return pointer */
/* Now we made sure the dword does not contain the character we are
looking for. But because we deal with strings we have to check
for the end of string before testing the next dword. */
xorl %edx, %ecx /* restore original dword without reload */
movl $0xfefefeff, %edi /* magic value */
addl %ecx, %edi /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* 0 */
jnc L2 /* highest byte is NUL => return NULL */
xorl %ecx, %edi /* (word+magic)^word */
orl $0xfefefeff, %edi /* set all non-carry bits */
incl %edi /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L2 /* found NUL => return NULL */
movl 4(%eax), %ecx /* get word (= 4 bytes) in question */
xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
are now 0 */
movl $0xfefefeff, %edi /* magic value */
addl %ecx, %edi /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* C */
jnc L71 /* highest byte is C => return pointer */
xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
orl $0xfefefeff, %edi /* set all non-carry bits */
incl %edi /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L71 /* found it => return pointer */
xorl %edx, %ecx /* restore original dword without reload */
movl $0xfefefeff, %edi /* magic value */
addl %ecx, %edi /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* 0 */
jnc L2 /* highest byte is NUL => return NULL */
xorl %ecx, %edi /* (word+magic)^word */
orl $0xfefefeff, %edi /* set all non-carry bits */
incl %edi /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L2 /* found NUL => return NULL */
movl 8(%eax), %ecx /* get word (= 4 bytes) in question */
xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
are now 0 */
movl $0xfefefeff, %edi /* magic value */
addl %ecx, %edi /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* C */
jnc L72 /* highest byte is C => return pointer */
xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
orl $0xfefefeff, %edi /* set all non-carry bits */
incl %edi /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L72 /* found it => return pointer */
xorl %edx, %ecx /* restore original dword without reload */
movl $0xfefefeff, %edi /* magic value */
addl %ecx, %edi /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* 0 */
jnc L2 /* highest byte is NUL => return NULL */
xorl %ecx, %edi /* (word+magic)^word */
orl $0xfefefeff, %edi /* set all non-carry bits */
incl %edi /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L2 /* found NUL => return NULL */
movl 12(%eax), %ecx /* get word (= 4 bytes) in question */
xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
are now 0 */
movl $0xfefefeff, %edi /* magic value */
addl %ecx, %edi /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* C */
jnc L73 /* highest byte is C => return pointer */
xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
orl $0xfefefeff, %edi /* set all non-carry bits */
incl %edi /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jnz L73 /* found it => return pointer */
xorl %edx, %ecx /* restore original dword without reload */
movl $0xfefefeff, %edi /* magic value */
addl %ecx, %edi /* add the magic value to the word. We get
carry bits reported for each byte which
is *not* 0 */
jnc L2 /* highest byte is NUL => return NULL */
xorl %ecx, %edi /* (word+magic)^word */
orl $0xfefefeff, %edi /* set all non-carry bits */
incl %edi /* add 1: if one carry bit was *not* set
the addition will not result in 0. */
jz L1 /* no NUL found => restart loop */
L2: /* Return NULL. */
xorl %eax, %eax /* load NULL in return value register */
popl %edi /* restore saved register content */
ret
/* C was detected in the 4th, 3rd or 2nd dword of the round: advance
%eax to the start of that dword. */
L73: addl $4, %eax /* adjust pointer */
L72: addl $4, %eax
L71: addl $4, %eax
/* We now scan for the byte in which the character was matched.
But we have to take care of the case that a NUL char is
found before this in the dword. Here %ecx still holds the dword
XORed with c|c|c|c: a byte that is 0 was C in the string, a byte
that equals C was NUL in the string. */
L7: testb %cl, %cl /* is first byte C? */
jz L6 /* yes => return pointer */
cmpb %dl, %cl /* is first byte NUL? */
je L2 /* yes => return NULL */
incl %eax /* it's not in the first byte */
testb %ch, %ch /* is second byte C? */
jz L6 /* yes => return pointer */
cmpb %dl, %ch /* is second byte NUL? */
je L2 /* yes => return NULL */
incl %eax /* it's not in the second byte */
shrl $16, %ecx /* make upper byte accessible */
testb %cl, %cl /* is third byte C? */
jz L6 /* yes => return pointer */
cmpb %dl, %cl /* is third byte NUL? */
je L2 /* yes => return NULL */
/* It must be in the fourth byte and it cannot be NUL. */
incl %eax
L6: popl %edi /* restore saved register content */
ret
weak_alias (strchr, index)

176
sysdeps/i386/strcspn.S Normal file
View File

@ -0,0 +1,176 @@
/* strcspn (str, ss) -- Return the length of the initial segment of STR
which contains no characters from SS.
For Intel 80x86, x>=3.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
str (sp + 4)
stopset (sp + 8)
*/
.text
ENTRY (strcspn)
movl 4(%esp), %edx /* get string pointer */
movl 8(%esp), %eax /* get stopset pointer */
/* First we create a table with flags for all possible characters.
For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
supported by the C string functions we have 256 characters.
Before inserting marks for the stop characters we clear the whole
table. The unrolled form is much faster than a loop. */
xorl %ecx, %ecx /* %ecx = 0 !!! */
pushl %ecx /* make a 256 bytes long block filled with 0 */
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl $0 /* These immediate values make the label 2 */
pushl $0 /* to be aligned on a 16 byte boundary to */
pushl $0 /* get a better performance of the loop. */
pushl $0
pushl $0
pushl $0
/* For understanding the following code remember that %ecx == 0 now.
Although all the following instruction only modify %cl we always
have a correct zero-extended 32-bit value in %ecx. */
/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want
longer instructions so that the next loop aligns without adding nops. */
L2: movb (%eax), %cl /* get byte from stopset */
testb %cl, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 1(%eax), %cl /* get byte from stopset */
testb $0xff, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 2(%eax), %cl /* get byte from stopset */
testb $0xff, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 3(%eax), %cl /* get byte from stopset */
addl $4, %eax /* increment stopset pointer */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
testb $0xff, %cl /* is NUL char? */
jnz L2 /* no => process next dword from stopset */
L1: leal -4(%edx), %eax /* prepare loop */
/* We use a neat trick for the following loop. Normally we would
have to test for two termination conditions
1. a character in the stopset was found
and
2. the end of the string was found
But as a sign that the chracter is in the stopset we store its
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
L3: addl $4, %eax /* adjust pointer for full loop round */
movb (%eax), %cl /* get byte from string */
cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
je L4 /* yes => return */
movb 1(%eax), %cl /* get byte from string */
cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
je L5 /* yes => return */
movb 2(%eax), %cl /* get byte from string */
cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
je L6 /* yes => return */
movb 3(%eax), %cl /* get byte from string */
cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
jne L3 /* yes => return */
incl %eax /* adjust pointer */
L6: incl %eax
L5: incl %eax
L4: subl %edx, %eax /* we have to return the number of valid
characters, so compute distance to first
non-valid character */
addl $256, %esp /* remove stopset */
ret

177
sysdeps/i386/strpbrk.S Normal file
View File

@ -0,0 +1,177 @@
/* strpbrk (str, ss) -- Return a pointer to the first character in STR
that is contained in SS, or NULL if there is no such character.
For Intel 80x86, x>=3.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
str (sp + 4)
stopset (sp + 8)
*/
.text
ENTRY (strpbrk)
movl 4(%esp), %edx /* get string pointer */
movl 8(%esp), %eax /* get stopset pointer */
/* First we create a table with flags for all possible characters.
For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
supported by the C string functions we have 256 characters.
Before inserting marks for the stop characters we clear the whole
table. The unrolled form is much faster than a loop. */
xorl %ecx, %ecx /* %ecx = 0 !!! */
pushl %ecx /* make a 256 bytes long block filled with 0 */
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl $0 /* These immediate values make the label 2 */
pushl $0 /* to be aligned on a 16 byte boundary to */
pushl $0 /* get a better performance of the loop. */
pushl $0
pushl $0
pushl $0
/* For understanding the following code remember that %ecx == 0 now.
Although all the following instruction only modify %cl we always
have a correct zero-extended 32-bit value in %ecx. */
/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want
longer instructions so that the next loop aligns without adding nops. */
L2: movb (%eax), %cl /* get byte from stopset */
testb %cl, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 1(%eax), %cl /* get byte from stopset */
testb $0xff, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 2(%eax), %cl /* get byte from stopset */
testb $0xff, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 3(%eax), %cl /* get byte from stopset */
addl $4, %eax /* increment stopset pointer */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
testb $0xff, %cl /* is NUL char? */
jnz L2 /* no => process next dword from stopset */
L1: leal -4(%edx), %eax /* prepare loop */
/* We use a neat trick for the following loop. Normally we would
have to test for two termination conditions
1. a character in the stopset was found
and
2. the end of the string was found
But as a sign that the chracter is in the stopset we store its
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
L3: addl $4, %eax /* adjust pointer for full loop round */
movb (%eax), %cl /* get byte from string */
cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
je L4 /* yes => return */
movb 1(%eax), %cl /* get byte from string */
cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
je L5 /* yes => return */
movb 2(%eax), %cl /* get byte from string */
cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
je L6 /* yes => return */
movb 3(%eax), %cl /* get byte from string */
cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
jne L3 /* yes => return */
incl %eax /* adjust pointer */
L6: incl %eax
L5: incl %eax
L4: addl $256, %esp /* remove stopset */
orb %cl, %cl /* was last character NUL? */
jnz L7 /* no => return pointer */
xorl %eax, %eax /* return NULL */
L7: ret

321
sysdeps/i386/strrchr.S Normal file
View File

@ -0,0 +1,321 @@
/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
For Intel 80x86, x>=3.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
str (sp + 4)
ch (sp + 8)
*/
.text
/* strrchr (str, ch): return in %eax a pointer to the last byte of the
   NUL-terminated string STR that equals the low byte of CH, or NULL if
   there is none.  The terminator itself takes part in the comparison,
   so searching for NUL returns a pointer to the terminating NUL.

   Register use:
     %esi  current position in STR
     %eax  best match found so far (NULL until the first match)
     %ecx  CH replicated into all four bytes (%cl is CH itself)
     %edx  byte resp. dword currently being examined
     %edi  scratch for the carry-bit tests  */
ENTRY (strrchr)
	pushl %edi		/* Save callee-saved registers used here.  */
	pushl %esi

	xorl %eax, %eax		/* result = NULL until a match is seen */
	movl 12(%esp), %esi	/* get string pointer */
	movl 16(%esp), %ecx	/* get character we are looking for */

	/* At the moment %ecx contains C.  What we need for the
	   algorithm is C in all bytes of the dword.  Avoid
	   operations on 16 bit words because these require a
	   prefix byte (and one more cycle).  */
	movb %cl, %ch		/* now it is 0|0|c|c */
	movl %ecx, %edx
	shll $16, %ecx		/* now it is c|c|0|0 */
	movw %dx, %cx		/* and finally c|c|c|c */

	/* Before we start with the main loop we process single bytes
	   until the source pointer is aligned.  This has two reasons:
	   1. aligned 32-bit memory access is faster
	   and (more important)
	   2. we process in the main loop 32 bit in one step although
	      we don't know the end of the string.  But accessing at
	      4-byte alignment guarantees that we never access illegal
	      memory if this would not also be done by the trivial
	      implementation (this is because all processor inherent
	      boundaries are multiples of 4).

	   The alignment test must be a 32-bit test: there is no byte
	   register form of %esi, so `testb $3, %esi' cannot express
	   "low two bits of the pointer".  */

	testl $3, %esi		/* correctly aligned ? */
	jz L19			/* yes => begin loop */
	movb (%esi), %dl	/* load byte in question (we need it twice) */
	cmpb %dl, %cl		/* compare byte */
	jne L11			/* no match => keep previous result */
	movl %esi, %eax		/* remember pointer as possible result */
L11:	orb %dl, %dl		/* is NUL? */
	jz L2			/* yes => return what we have so far */
	incl %esi		/* increment pointer */

	testl $3, %esi		/* correctly aligned ? */
	jz L19			/* yes => begin loop */
	movb (%esi), %dl	/* load byte in question (we need it twice) */
	cmpb %dl, %cl		/* compare byte */
	jne L12			/* no match => keep previous result */
	movl %esi, %eax		/* remember pointer as result */
L12:	orb %dl, %dl		/* is NUL? */
	jz L2			/* yes => return what we have so far */
	incl %esi		/* increment pointer */

	testl $3, %esi		/* correctly aligned ? */
	jz L19			/* yes => begin loop */
	movb (%esi), %dl	/* load byte in question (we need it twice) */
	cmpb %dl, %cl		/* compare byte */
	jne L13			/* no match => keep previous result */
	movl %esi, %eax		/* remember pointer as result */
	/* The NUL test has to look at the byte just loaded (%dl), not at
	   the search character (%cl).  Testing %cl would run past the
	   terminator when the string ends here, and would return NULL
	   early when searching for NUL in a longer string.  */
L13:	orb %dl, %dl		/* is NUL? */
	jz L2			/* yes => return what we have so far */
	incl %esi		/* increment pointer */

	/* Now we have reached alignment.  */
	jmp L19			/* begin loop */

	/* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
	   change any of the hole bits of LONGWORD.

	   1) Is this safe?  Will it catch all the zero bytes?
	   Suppose there is a byte with all zeros.  Any carry bits
	   propagating from its left will fall into the hole at its
	   least significant bit and stop.  Since there will be no
	   carry from its most significant bit, the LSB of the
	   byte to the left will be unchanged, and the zero will be
	   detected.

	   2) Is this worthwhile?  Will it ignore everything except
	   zero bytes?  Suppose every byte of LONGWORD has a bit set
	   somewhere.  There will be a carry into bit 8.  If bit 8
	   is set, this will carry into bit 16.  If bit 8 is clear,
	   one of bits 9-15 must be set, so there will be a carry
	   into bit 16.  Similarly, there will be a carry into bit
	   24.  If one of bits 24-31 is set, there will be a carry
	   into bit 32 (=carry flag), so all of the hole bits will
	   be changed.

	   3) But wait!  Aren't we looking for C, not zero?
	   Good point.  So what we do is XOR LONGWORD with a longword,
	   each of whose bytes is C.  This turns each byte that is C
	   into a zero.  */

	/* Each round the main loop processes 16 bytes.  */

	/* Jump to here when the character is detected.  We chose this
	   way around because the character one is looking for is not
	   as frequent as the rest and taking a conditional jump is more
	   expensive than ignoring it.

	   Some more words to the code below: it might not be obvious why
	   we decrement the source pointer here.  In the loop the pointer
	   is not pre-incremented and so it still points before the word
	   we are looking at.  But you should take a look at the instruction
	   which gets executed before we get into the loop: `addl $16, %esi'.
	   This makes the following subs into adds.

	   The entry labels are chosen so that afterwards %esi + 12 is the
	   address of the dword held in %edx: L4/L3 are used for the first
	   dword of a round, L41/L31 for the second, L42/L32 for the third
	   and L43/L33 for the fourth.  */

	/* These fill bytes make the main loop be correctly aligned.
	   We cannot use align because it is not the following instruction
	   which should be aligned.
	   NOTE(review): the pad count depends on the encoded size of the
	   code above; re-check the alignment of L19 after any change to
	   the alignment preamble.  */
	.byte 0, 0, 0, 0, 0, 0, 0, 0

L4:	subl $4, %esi		/* adjust pointer */
L41:	subl $4, %esi
L42:	subl $4, %esi
L43:	testl $0xff000000, %edx	/* is highest byte == C? */
	jnz L33			/* no => try other bytes */
	leal 15(%esi), %eax	/* store address as result */
	jmp L1			/* and start loop again */

L3:	subl $4, %esi		/* adjust pointer */
L31:	subl $4, %esi
L32:	subl $4, %esi
L33:	testl $0xff0000, %edx	/* is C in third byte? */
	jnz L51			/* no => try other bytes */
	leal 14(%esi), %eax	/* store address as result */
	jmp L1			/* and start loop again */

L51:
	/* At this point we know that the byte is in one of the lower bytes.
	   We make a guess and correct it if necessary.  This reduces the
	   number of necessary jumps.  */
	leal 12(%esi), %eax	/* guess address of lowest byte as result */
	testb %dh, %dh		/* is guess correct? */
	jnz L1			/* yes => start loop */
	leal 13(%esi), %eax	/* correct guess to second byte */

L1:	addl $16, %esi		/* increment pointer for full round */

L19:	movl (%esi), %edx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	addl %edx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */

	/* According to the algorithm we had to reverse the effect of the
	   XOR first and then test the overflow bits.  But because the
	   following XOR would destroy the carry flag and it would (in a
	   representation with more than 32 bits) not alter the last
	   overflow, we can now test this condition.  If no carry is signaled
	   no overflow must have occurred in the last byte => it was 0.  */
	jnc L20			/* found NUL => check last word */

	/* We are only interested in carry bits that change due to the
	   previous add, so remove original bits */
	xorl %edx, %edi		/* (word+magic)^word */

	/* Now test for the other three overflow bits.  */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */

	/* If at least one byte of the word is NUL we don't get 0 in %edi.  */
	jnz L20			/* found NUL => check last word */

	/* Now we made sure the dword does not contain the end of the
	   string.  Next we look for the character in this dword before
	   testing the next one.  */
	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	movl $0xfefefeff, %edi	/* magic value */
	addl %edx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L4			/* highest byte is C => examine dword */
	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L3			/* C is detected in the word => examine it */

	movl 4(%esi), %edx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	addl %edx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L21			/* found NUL => check last word */
	xorl %edx, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L21			/* found NUL => check last word */
	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	movl $0xfefefeff, %edi	/* magic value */
	addl %edx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L41			/* highest byte is C => examine dword */
	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L31			/* C is detected in the word => examine it */

	movl 8(%esi), %edx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	addl %edx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L22			/* found NUL => check last word */
	xorl %edx, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L22			/* found NUL => check last word */
	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	movl $0xfefefeff, %edi	/* magic value */
	addl %edx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L42			/* highest byte is C => examine dword */
	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L32			/* C is detected in the word => examine it */

	movl 12(%esi), %edx	/* get word (= 4 bytes) in question */
	movl $0xfefefeff, %edi	/* magic value */
	addl %edx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L23			/* found NUL => check last word */
	xorl %edx, %edi		/* (word+magic)^word */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jnz L23			/* found NUL => check last word */
	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
				   are now 0 */
	movl $0xfefefeff, %edi	/* magic value */
	addl %edx, %edi		/* add the magic value to the word.  We get
				   carry bits reported for each byte which
				   is *not* 0 */
	jnc L43			/* highest byte is C => examine dword */
	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
	orl $0xfefefeff, %edi	/* set all non-carry bits */
	incl %edi		/* add 1: if one carry bit was *not* set
				   the addition will not result in 0.  */
	jz L1			/* C is not detected => restart loop */
	jmp L33			/* examine word */

	/* A dword containing NUL was found.  Make %esi point at it.  */
L23:	addl $4, %esi		/* adjust pointer */
L22:	addl $4, %esi
L21:	addl $4, %esi

	/* What remains to do is to test which byte the NUL char is and
	   whether the searched character appears in one of the bytes
	   before.  A special case is that the searched byte may be NUL.
	   In this case a pointer to the terminating NUL char has to be
	   returned.  This is why each byte is compared with C before it
	   is tested for NUL.  */

L20:	cmpb %cl, %dl		/* is first byte == C? */
	jne L24			/* no => skip */
	movl %esi, %eax		/* store address as result */
L24:	testb %dl, %dl		/* is first byte == NUL? */
	jz L2			/* yes => return */

	cmpb %cl, %dh		/* is second byte == C? */
	jne L25			/* no => skip */
	leal 1(%esi), %eax	/* store address as result */
L25:	testb %dh, %dh		/* is second byte == NUL? */
	jz L2			/* yes => return */

	shrl $16,%edx		/* make upper bytes accessible */
	cmpb %cl, %dl		/* is third byte == C */
	jne L26			/* no => skip */
	leal 2(%esi), %eax	/* store address as result */
L26:	testb %dl, %dl		/* is third byte == NUL */
	jz L2			/* yes => return */

	/* The dword is known to contain NUL and the first three bytes are
	   not NUL, so the fourth byte is the terminator.  It can only
	   match when C itself is NUL.  */
	cmpb %cl, %dh		/* is fourth byte == C */
	jne L2			/* no => skip */
	leal 3(%esi), %eax	/* store address as result */

L2:	popl %esi		/* restore saved register content */
	popl %edi

	ret

weak_alias (strrchr, rindex)

176
sysdeps/i386/strspn.S Normal file
View File

@ -0,0 +1,176 @@
/* strspn (str, ss) -- Return the length of the initial segment of STR
which contains only characters from SS.
For Intel 80x86, x>=3.
Copyright (C) 1994, 1995 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <sysdep.h>
#include "asm-syntax.h"
/*
INPUT PARAMETERS:
str (sp + 4)
skipset (sp + 8)
*/
.text
ENTRY (strspn)
movl 4(%esp), %edx /* get string pointer */
movl 8(%esp), %eax /* get skipset pointer */
/* First we create a table with flags for all possible characters.
For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
supported by the C string functions we have 256 characters.
Before inserting marks for the stop characters we clear the whole
table. The unrolled form is much faster than a loop. */
xorl %ecx, %ecx /* %ecx = 0 !!! */
pushl %ecx /* make a 256 bytes long block filled with 0 */
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl %ecx
pushl $0 /* These immediate values make the label 2 */
pushl $0 /* to be aligned on a 16 byte boundary to */
pushl $0 /* get a better performance of the loop. */
pushl $0
pushl $0
pushl $0
/* For understanding the following code remember that %ecx == 0 now.
Although all the following instruction only modify %cl we always
have a correct zero-extended 32-bit value in %ecx. */
/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want
longer instructions so that the next loop aligns without adding nops. */
L2: movb (%eax), %cl /* get byte from stopset */
testb %cl, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 1(%eax), %cl /* get byte from stopset */
testb $0xff, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 2(%eax), %cl /* get byte from stopset */
testb $0xff, %cl /* is NUL char? */
jz L1 /* yes => start compare loop */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
movb 3(%eax), %cl /* get byte from stopset */
addl $4, %eax /* increment stopset pointer */
movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
testb $0xff, %cl /* is NUL char? */
jnz L2 /* no => process next dword from stopset */
L1: leal -4(%edx), %eax /* prepare loop */
/* We use a neat trick for the following loop. Normally we would
have to test for two termination conditions
1. a character in the stopset was found
and
2. the end of the string was found
But as a sign that the chracter is in the stopset we store its
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
L3: addl $4, %eax /* adjust pointer for full loop round */
movb (%eax), %cl /* get byte from string */
testb %cl, (%esp,%ecx) /* is it contained in skipset? */
jz L4 /* no => return */
movb 1(%eax), %cl /* get byte from string */
testb %cl, (%esp,%ecx) /* is it contained in skipset? */
jz L5 /* no => return */
movb 2(%eax), %cl /* get byte from string */
testb %cl, (%esp,%ecx) /* is it contained in skipset? */
jz L6 /* no => return */
movb 3(%eax), %cl /* get byte from string */
testb %cl, (%esp,%ecx) /* is it contained in skipset? */
jnz L3 /* yes => start loop again */
incl %eax /* adjust pointer */
L6: incl %eax
L5: incl %eax
L4: subl %edx, %eax /* we have to return the number of valid
characters, so compute distance to first
non-valid character */
addl $256, %esp /* remove stopset */
ret

View File

@ -1,7 +1,7 @@
/* i80386 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
store difference in a third limb vector.
Copyright (C) 1992, 1994 Free Software Foundation, Inc.
Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_sub_n:)
subl %eax,%edx /* ... enter the loop */
shrl $2,%eax /* restore previous value */
#ifdef PIC
call here
here: leal (Loop - 3 - here)(%eax,%eax,8),%eax
addl %eax,(%esp)
ret
/* Calculate start address in loop for PIC. Due to limitations in some
assemblers, Loop-L0-3 cannot be put into the leal */
call L0
L0: leal (%eax,%eax,8),%eax
addl (%esp),%eax
addl $(Loop-L0-3),%eax
addl $4,%esp
#else
leal (Loop - 3)(%eax,%eax,8),%eax /* calc start addr in loop */
jmp *%eax /* jump into loop */
/* Calculate start address in loop for non-PIC. */
leal (Loop - 3)(%eax,%eax,8),%eax
#endif
jmp *%eax /* jump into loop */
ALIGN (3)
Loop: movl (%esi),%eax
sbbl (%edx),%eax

21
sysdeps/i960/add_n.s Normal file
View File

@ -0,0 +1,21 @@
# i960 ___mpn_add_n -- add two limb vectors limb by limb, propagating the
# carry, and store the sums in a third vector.
# Register use as visible in the code below:
#   g0  res_ptr; overwritten with the saved carry word (g6) before return
#   g1  s1_ptr
#   g2  s2_ptr
#   g3  loop counter, decremented once per limb; the loop ends when it
#       reaches 0 (presumably the vector length on entry -- confirm)
#   g4, g5  limbs loaded from s2 and s1
#   g6  carry saved across iterations, because the loop-closing compare
#       overwrites the condition code
.text
.align 4
.globl ___mpn_add_n
___mpn_add_n:
mov 0,g6 # clear carry-save register
cmpo 1,0 # clear cy
Loop: subo 1,g3,g3 # update loop counter
ld (g1),g5 # load from s1_ptr
addo 4,g1,g1 # s1_ptr++
ld (g2),g4 # load from s2_ptr
addo 4,g2,g2 # s2_ptr++
cmpo g6,1 # restore cy from g6, relies on cy being 0
addc g4,g5,g4 # main add
subc 0,0,g6 # save cy in g6
st g4,(g0) # store result to res_ptr
addo 4,g0,g0 # res_ptr++
cmpobne 0,g3,Loop # when branch is taken, clears C bit
mov g6,g0 # hand the saved carry word back in g0
ret

26
sysdeps/i960/addmul_1.s Normal file
View File

@ -0,0 +1,26 @@
# i960 ___mpn_addmul_1 -- multiply a limb vector by a single limb and add
# the product into a second limb vector, returning the final carry limb.
#
# This file previously exported the routine as ___mpn_mul_1, the same
# symbol that mul_1.s defines.  The code below loads the destination limb
# and adds it to the product, i.e. it is the multiply-and-accumulate
# routine, so it is exported under the addmul name.
#
# Register use as visible in the code below:
#   g0  res_ptr on entry; then the running carry limb and return value
#   g1  s1_ptr, advanced to the end of its block
#   g2  size, negated and used as an index counting up towards 0
#   g3  multiplier (presumably s2_limb, following the res_ptr, s1_ptr,
#       size, s2_limb order of the other mpn_*_1 routines -- confirm)
#   g6,g7  low and high word of the 64-bit product from emul
#   g13 res_ptr advanced to the end of its block
.text
.align 4
.globl ___mpn_addmul_1
___mpn_addmul_1:
subo g2,0,g2 # g2 = -size
shlo 2,g2,g4 # g4 = -size * 4 (byte offset)
subo g4,g1,g1 # s1_ptr += size * 4
subo g4,g0,g13 # g13 = res_ptr + size * 4
mov 0,g0 # carry limb = 0
cmpo 1,0 # clear C bit on AC.cc
Loop: ld (g1)[g2*4],g5 # load s1 limb
emul g3,g5,g6 # g6,g7 = multiplier * s1 limb
ld (g13)[g2*4],g5 # load destination limb
addc g0,g6,g6 # relies on that C bit is clear
addc 0,g7,g7 # propagate carry into high product word
addc g5,g6,g6 # relies on that C bit is clear
st g6,(g13)[g2*4] # store updated destination limb
addc 0,g7,g0 # new carry limb = high word + carry
addo g2,1,g2 # advance index towards 0
cmpobne 0,g2,Loop # when branch is taken, clears C bit
ret

23
sysdeps/i960/mul_1.s Normal file
View File

@ -0,0 +1,23 @@
# i960 ___mpn_mul_1 -- multiply a limb vector by a single limb and store
# the product in a second limb vector; the last carry limb is left in g0.
# Register use as visible in the code below:
#   g0  res_ptr on entry; then the running carry limb
#   g1  s1_ptr, advanced to the end of its block
#   g2  size, negated and used as an index counting up towards 0
#   g3  multiplier (presumably s2_limb -- confirm against the prototype)
#   g6,g7  low and high word of the product from emul
#   g13 res_ptr advanced to the end of its block
.text
.align 4
.globl ___mpn_mul_1
___mpn_mul_1:
subo g2,0,g2
shlo 2,g2,g4
subo g4,g1,g1
subo g4,g0,g13
mov 0,g0
cmpo 1,0 # clear C bit on AC.cc
Loop: ld (g1)[g2*4],g5
emul g3,g5,g6
addc g0,g6,g6 # relies on that C bit is clear
st g6,(g13)[g2*4]
addc 0,g7,g0
addo g2,1,g2
cmpobne 0,g2,Loop # when branch is taken, clears C bit
ret

21
sysdeps/i960/sub_n.s Normal file
View File

@ -0,0 +1,21 @@
# i960 ___mpn_sub_n -- subtract two limb vectors limb by limb, propagating
# the borrow, and store the differences in a third vector.
# Register use as visible in the code below:
#   g0  res_ptr; overwritten with the saved carry word (g6) before return
#   g1  s1_ptr
#   g2  s2_ptr
#   g3  loop counter, decremented once per limb; the loop ends when it
#       reaches 0 (presumably the vector length on entry -- confirm)
#   g4, g5  limbs loaded from s2 and s1
#   g6  carry saved across iterations, because the loop-closing compare
#       overwrites the condition code; it starts out as 1 here
.text
.align 4
.globl ___mpn_sub_n
___mpn_sub_n:
mov 1,g6 # set carry-save register
cmpo 1,0 # clear cy
Loop: subo 1,g3,g3 # update loop counter
ld (g1),g5 # load from s1_ptr
addo 4,g1,g1 # s1_ptr++
ld (g2),g4 # load from s2_ptr
addo 4,g2,g2 # s2_ptr++
cmpo g6,1 # restore cy from g6, relies on cy being 0
subc g4,g5,g4 # main subtract
subc 0,0,g6 # save cy in g6
st g4,(g0) # store result to res_ptr
addo 4,g0,g0 # res_ptr++
cmpobne 0,g3,Loop # when branch is taken, cy will be 0
mov g6,g0 # hand the saved carry word back in g0
ret

103
sysdeps/m88k/m88100/add_n.s Normal file
View File

@ -0,0 +1,103 @@
; mc88100 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
; sum in a third limb vector.
; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr r2
; s1_ptr r3
; s2_ptr r4
; size r5
; This code has been optimized to run one instruction per clock, avoiding
; load stalls and writeback contention. As a result, the instruction
; order is not always natural.
; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
; ___mpn_add_n: add the limb vectors at s1_ptr (r3) and s2_ptr (r4), store
; the sums at res_ptr (r2) and return the final carry in r2.
; The loop is unrolled 8 times.  Each limb step starting at Lbase is four
; instructions (16 bytes of code), so the entry code computes a jump
; target inside the loop from the low three bits of the negated size and
; biases the three pointers by the matching number of limbs.
; Instructions following a ".n" branch sit in its delay slot.
text
align 16
global ___mpn_add_n
___mpn_add_n:
ld r6,r3,0 ; read first limb from s1_ptr
extu r10,r5,3 ; r10 = size >> 3, used as loop counter
ld r7,r4,0 ; read first limb from s2_ptr
subu.co r5,r0,r5 ; (clear carry as side effect)
mak r5,r5,3<4> ; r5 = (-size & 7) * 16 = code offset into loop
bcnd eq0,r5,Lzero ; no limbs to skip => enter at the loop top
or r12,r0,lo16(Lbase)
or.u r12,r12,hi16(Lbase)
addu r12,r12,r5 ; r12 is address for entering in loop
extu r5,r5,2 ; divide by 4
subu r2,r2,r5 ; adjust res_ptr
subu r3,r3,r5 ; adjust s1_ptr
subu r4,r4,r5 ; adjust s2_ptr
or r8,r6,r0 ; copy first limbs so either register pair is valid
jmp.n r12
or r9,r7,r0 ; (delay slot)
Loop: addu r3,r3,32
st r8,r2,28
addu r4,r4,32
ld r6,r3,0
addu r2,r2,32
ld r7,r4,0
Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt)
Lbase: ld r8,r3,4
addu.cio r6,r6,r7
ld r9,r4,4
st r6,r2,0
ld r6,r3,8 ; add 7 + 8r limbs
addu.cio r8,r8,r9
ld r7,r4,8
st r8,r2,4
ld r8,r3,12 ; add 6 + 8r limbs
addu.cio r6,r6,r7
ld r9,r4,12
st r6,r2,8
ld r6,r3,16 ; add 5 + 8r limbs
addu.cio r8,r8,r9
ld r7,r4,16
st r8,r2,12
ld r8,r3,20 ; add 4 + 8r limbs
addu.cio r6,r6,r7
ld r9,r4,20
st r6,r2,16
ld r6,r3,24 ; add 3 + 8r limbs
addu.cio r8,r8,r9
ld r7,r4,24
st r8,r2,20
ld r8,r3,28 ; add 2 + 8r limbs
addu.cio r6,r6,r7
ld r9,r4,28
st r6,r2,24
bcnd.n ne0,r10,Loop ; add 1 + 8r limbs
addu.cio r8,r8,r9 ; (delay slot) last add of the round
st r8,r2,28 ; store most significant limb
jmp.n r1
addu.ci r2,r0,r0 ; return carry-out from most sign. limb

128
sysdeps/m88k/m88100/mul_1.s Normal file
View File

@ -0,0 +1,128 @@
; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and
; store the product in a second limb vector.
; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr r2
; s1_ptr r3
; size r4
; s2_limb r5
; Common overhead is about 11 cycles/invocation.
; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The
; pipeline stalls 2 cycles due to WB contention.)
; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The
; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
; To enhance speed:
; 1. Unroll main loop 4-8 times.
; 2. Schedule code to avoid WB contention. It might be tempting to move the
; ld instruction in the loops down to save 2 cycles (less WB contention),
; but that looses because the ultimate value will be read from outside
; the allocated space. But if we handle the ultimate multiplication in
; the tail, we can do this.
; 3. Make the multiplication with less instructions. I think the code for
; (S2_LIMB >= 0x10000) is not minimal.
; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
; cycles/limb. (Assuming infinite unrolling.)
; ___mpn_mul_1: multiply the limb vector at s1_ptr (r3) by s2_limb (r5),
; store the low product limbs at res_ptr (r2) and return the final carry
; limb in r2.  Two loops follow: a general one that builds the high
; product word from 16x16-bit partial products, and a shorter one used
; when the upper half of s2_limb is zero.
; Instructions following a ".n" branch sit in its delay slot.
text
align 16
global ___mpn_mul_1
___mpn_mul_1:
; Make S1_PTR and RES_PTR point at the end of their blocks
; and negate SIZE.
lda r3,r3[r4]
lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval
subu r4,r0,r4
addu.co r2,r0,r0 ; r2 = cy = 0
ld r9,r3[r4]
mask r7,r5,0xffff ; r7 = lo(S2_LIMB)
extu r8,r5,16 ; r8 = hi(S2_LIMB)
bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0)
subu r6,r6,4 ; (delay slot) bias RES_PTR: stores happen after
; the index has already been incremented
; General code for any value of S2_LIMB.
; Make a stack frame and save r25 and r26
subu r31,r31,16
st.d r25,r31,8
; Enter the loop in the middle
br.n L1
addu r4,r4,1
Loop:
ld r9,r3[r4]
st r26,r6[r4]
; bcnd ne0,r0,0 ; bubble
addu r4,r4,1
L1: mul r26,r9,r5 ; low word of product mul_1 WB ld
mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1
mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1
mul r10,r12,r8 ; r10 = prod_1a mul_3
extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1
mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1
mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2
extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3
addu r10,r10,r11 ; addu_1 WB extu_2
; bcnd ne0,r0,0 ; bubble WB addu_1
addu.co r10,r10,r12 ; WB mul_4
mask.u r10,r10,0xffff ; move the 16 most significant bits...
addu.ci r10,r10,r0 ; ...to the low half of the word...
rot r10,r10,16 ; ...and put carry in pos 16.
addu.co r26,r26,r2 ; add old carry limb
bcnd.n ne0,r4,Loop
addu.ci r2,r25,r10 ; compute new carry limb
st r26,r6[r4]
ld.d r25,r31,8
jmp.n r1
addu r31,r31,16
; Fast code for S2_LIMB < 0x10000
; r8 held hi(S2_LIMB), which is 0 on this path; it is reused below for
; the low product word.
Lsmall:
; Enter the loop in the middle
br.n SL1
addu r4,r4,1
SLoop:
ld r9,r3[r4] ;
st r8,r6[r4] ;
addu r4,r4,1 ;
SL1: mul r8,r9,r5 ; low word of product
mask r12,r9,0xffff ; r12 = lo(s1_limb)
extu r13,r9,16 ; r13 = hi(s1_limb)
mul r11,r12,r7 ; r11 = prod_0
mul r12,r13,r7 ; r12 = prod_1b
addu.cio r8,r8,r2 ; add old carry limb
extu r10,r11,16 ; r10 = hi(prod_0)
addu r10,r10,r12 ;
bcnd.n ne0,r4,SLoop
extu r2,r10,16 ; r2 = new carry limb
jmp.n r1
st r8,r6[r4]

104
sysdeps/m88k/m88100/sub_n.s Normal file
View File

@ -0,0 +1,104 @@
; mc88100 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
; store difference in a third limb vector.
; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr r2
; s1_ptr r3
; s2_ptr r4
; size r5
; This code has been optimized to run one instruction per clock, avoiding
; load stalls and writeback contention. As a result, the instruction
; order is not always natural.
; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
; ___mpn_sub_n: subtract limb by limb the vector at s2_ptr (r4) from the
; vector at s1_ptr (r3), store the differences at res_ptr (r2) and return
; the inverted final carry bit in r2.
; The loop is unrolled 8 times.  Each limb step starting at Lbase is four
; instructions (16 bytes of code), so the entry code computes a jump
; target inside the loop from the low three bits of the negated size and
; biases the three pointers by the matching number of limbs.
; Instructions following a ".n" branch sit in its delay slot.
text
align 16
global ___mpn_sub_n
___mpn_sub_n:
ld r6,r3,0 ; read first limb from s1_ptr
extu r10,r5,3 ; r10 = size >> 3, used as loop counter
ld r7,r4,0 ; read first limb from s2_ptr
subu.co r5,r0,r5 ; (clear carry as side effect)
mak r5,r5,3<4> ; r5 = (-size & 7) * 16 = code offset into loop
bcnd eq0,r5,Lzero ; no limbs to skip => enter at the loop top
or r12,r0,lo16(Lbase)
or.u r12,r12,hi16(Lbase)
addu r12,r12,r5 ; r12 is address for entering in loop
extu r5,r5,2 ; divide by 4
subu r2,r2,r5 ; adjust res_ptr
subu r3,r3,r5 ; adjust s1_ptr
subu r4,r4,r5 ; adjust s2_ptr
or r8,r6,r0 ; copy first limbs so either register pair is valid
jmp.n r12
or r9,r7,r0 ; (delay slot)
Loop: addu r3,r3,32
st r8,r2,28
addu r4,r4,32
ld r6,r3,0
addu r2,r2,32
ld r7,r4,0
Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt)
Lbase: ld r8,r3,4
subu.cio r6,r6,r7
ld r9,r4,4
st r6,r2,0
ld r6,r3,8 ; subtract 7 + 8r limbs
subu.cio r8,r8,r9
ld r7,r4,8
st r8,r2,4
ld r8,r3,12 ; subtract 6 + 8r limbs
subu.cio r6,r6,r7
ld r9,r4,12
st r6,r2,8
ld r6,r3,16 ; subtract 5 + 8r limbs
subu.cio r8,r8,r9
ld r7,r4,16
st r8,r2,12
ld r8,r3,20 ; subtract 4 + 8r limbs
subu.cio r6,r6,r7
ld r9,r4,20
st r6,r2,16
ld r6,r3,24 ; subtract 3 + 8r limbs
subu.cio r8,r8,r9
ld r7,r4,24
st r8,r2,20
ld r8,r3,28 ; subtract 2 + 8r limbs
subu.cio r6,r6,r7
ld r9,r4,28
st r6,r2,24
bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs
subu.cio r8,r8,r9 ; (delay slot) last subtract of the round
st r8,r2,28 ; store most significant limb
addu.ci r2,r0,r0 ; return carry-out from most sign. limb
jmp.n r1
xor r2,r2,1 ; (delay slot) invert the carry bit for the return value

View File

@ -0,0 +1,84 @@
; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and
; store the product in a second limb vector.
; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Library General Public License as published by
; the Free Software Foundation; either version 2 of the License, or (at your
; option) any later version.
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
; License for more details.
; You should have received a copy of the GNU Library General Public License
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; INPUT PARAMETERS
; res_ptr r2
; s1_ptr r3
; size r4
; s2_limb r5
; ___mpn_mul_1 (mc88110): multiply the limb vector at s1_ptr (r3) by
; s2_limb (r5), store the low product limbs at res_ptr (r2) and return
; the final carry limb in r2.
; mulu.d leaves the double-word product in r10/r11; the code below uses
; r11 as the low word (stored after adding the carry limb) and r10 as
; the high word (which becomes the next carry limb).
; Instructions following a ".n" branch sit in its delay slot.
text
align 16
global ___mpn_mul_1
___mpn_mul_1:
; Make S1_PTR and RES_PTR point at the end of their blocks
; and negate SIZE.
lda r3,r3[r4]
lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval
subu r4,r0,r4
addu.co r2,r0,r0 ; r2 = cy = 0
ld r6,r3[r4]
addu r4,r4,1
mulu.d r10,r6,r5
bcnd.n eq0,r4,Lend ; only one limb => skip the loop
subu r8,r8,8 ; (delay slot) bias RES_PTR: each store happens one
; product late, after the index was incremented
Loop: ld r6,r3[r4]
addu.cio r9,r11,r2 ; low product word + old carry limb
or r2,r10,r0 ; could be avoided if unrolled
addu r4,r4,1
mulu.d r10,r6,r5
bcnd.n ne0,r4,Loop
st r9,r8[r4] ; (delay slot) store previous result limb
Lend: addu.cio r9,r11,r2
st r9,r8,4
jmp.n r1
addu.ci r2,r10,r0 ; (delay slot) return high word + carry
; This is the Right Way to do this on '110. 4 cycles / 64-bit limb.
; ld.d r10,
; mulu.d
; addu.cio
; addu.cio
; st.d
; mulu.d ,r11,r5
; ld.d r12,
; mulu.d ,r10,r5
; addu.cio
; addu.cio
; st.d
; mulu.d
; ld.d r10,
; mulu.d
; addu.cio
; addu.cio
; st.d
; mulu.d
; ld.d r10,
; mulu.d
; addu.cio
; addu.cio
; st.d
; mulu.d

119
sysdeps/mips/add_n.s Normal file
View File

@ -0,0 +1,119 @@
# MIPS2 __mpn_add_n -- Add two limb vectors of the same length > 0 and
# store sum in a third limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# s2_ptr $6
# size $7
# __mpn_add_n: add the limb vectors at s1_ptr ($5) and s2_ptr ($6), store
# the sums at res_ptr ($4) and return the final carry in $2.
# MIPS has no carry flag, so each limb step derives the carry with sltu:
# once for "s2 limb + carry in" (into $8) and once for the main add
# (into $2); the two are then or-ed together.
# The first limbs are loaded up front and the last limb is added at
# .Lend, so the loops handle size-1 limbs: .Loop0 takes (size-1) mod 4 of
# them one at a time and .Loop takes the rest four at a time.
# Because of ".set noreorder" the instruction after each branch is its
# delay slot and executes whether or not the branch is taken.
.text
.align 2
.globl __mpn_add_n
.ent __mpn_add_n
__mpn_add_n:
.set noreorder
.set nomacro
lw $10,0($5) # first s1 limb
lw $11,0($6) # first s2 limb
addiu $7,$7,-1 # the last limb is handled at .Lend
and $9,$7,4-1 # number of limbs in first loop
beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
move $2,$0 # (delay slot) carry = 0
subu $7,$7,$9 # limbs left for the 4-way loop
.Loop0: addiu $9,$9,-1
lw $12,4($5) # preload next s1 limb
addu $11,$11,$2 # s2 limb + carry in
lw $13,4($6) # preload next s2 limb
sltu $8,$11,$2 # carry from adding the carry in
addu $11,$10,$11 # main add
sltu $2,$11,$10 # carry from the main add
sw $11,0($4)
or $2,$2,$8 # combined carry out
addiu $5,$5,4
addiu $6,$6,4
move $10,$12
move $11,$13
bne $9,$0,.Loop0
addiu $4,$4,4 # (delay slot)
.L0: beq $7,$0,.Lend
nop
.Loop: addiu $7,$7,-4
lw $12,4($5)
addu $11,$11,$2
lw $13,4($6)
sltu $8,$11,$2
addu $11,$10,$11
sltu $2,$11,$10
sw $11,0($4)
or $2,$2,$8
lw $10,8($5)
addu $13,$13,$2
lw $11,8($6)
sltu $8,$13,$2
addu $13,$12,$13
sltu $2,$13,$12
sw $13,4($4)
or $2,$2,$8
lw $12,12($5)
addu $11,$11,$2
lw $13,12($6)
sltu $8,$11,$2
addu $11,$10,$11
sltu $2,$11,$10
sw $11,8($4)
or $2,$2,$8
lw $10,16($5)
addu $13,$13,$2
lw $11,16($6)
sltu $8,$13,$2
addu $13,$12,$13
sltu $2,$13,$12
sw $13,12($4)
or $2,$2,$8
addiu $5,$5,16
addiu $6,$6,16
bne $7,$0,.Loop
addiu $4,$4,16 # (delay slot)
# Add the final limb pair, already loaded into $10 and $11.
.Lend: addu $11,$11,$2
sltu $8,$11,$2
addu $11,$10,$11
sltu $2,$11,$10
sw $11,0($4)
j $31
or $2,$2,$8 # (delay slot) return value = final carry
.end __mpn_add_n

96
sysdeps/mips/addmul_1.s Normal file
View File

@ -0,0 +1,96 @@
# MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and
# add the product to a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_addmul_1: multiply the size-limb vector at s1_ptr by s2_limb and add
# the product into the size-limb vector at res_ptr (read-modify-write).  The
# carry limb out of the last position is left in $2.  Limbs are 32-bit words.
# The first limb is loaded unconditionally, so size must be at least 1.
#
# The multiply is software pipelined: each stage reads HI/LO of the product
# started one stage earlier and starts the next multiply.  size == 1 enters
# at $LC0, size == 2 at $LC1, larger sizes run Loop size-2 times first.
#
# Register use: $8 = s1 limb being multiplied, $3/$9 = low/high product
# limb, $10 = res limb (then scratch carry), $2 = carry limb.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# size $6
# s2_limb $7
.text
.align 4
.globl __mpn_addmul_1
.ent __mpn_addmul_1
__mpn_addmul_1:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
# warm up phase 0
lw $8,0($5)
# warm up phase 1
addiu $5,$5,4
multu $8,$7 # start first product
addiu $6,$6,-1
beq $6,$0,$LC0 # size == 1: only the final stage is needed
move $2,$0 # zero cy2
addiu $6,$6,-1
beq $6,$0,$LC1 # size == 2: skip the steady-state loop
lw $8,0($5) # load new s1 limb as early as possible
# Steady state: retire one product and start the next one.
Loop: lw $10,0($4)
mflo $3
mfhi $9
addiu $5,$5,4
addu $3,$3,$2 # add old carry limb to low product limb
multu $8,$7
lw $8,0($5) # load new s1 limb as early as possible
addiu $6,$6,-1 # decrement loop counter
sltu $2,$3,$2 # carry from previous addition -> $2
addu $3,$10,$3 # add the res limb
sltu $10,$3,$10 # carry from adding the res limb
addu $2,$2,$10
sw $3,0($4)
addiu $4,$4,4
bne $6,$0,Loop # should be "bnel"
addu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 1
# Retire the second-to-last product and start the last multiply.
$LC1: lw $10,0($4)
mflo $3
mfhi $9
addu $3,$3,$2
sltu $2,$3,$2
multu $8,$7
addu $3,$10,$3
sltu $10,$3,$10
addu $2,$2,$10
sw $3,0($4)
addiu $4,$4,4
addu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 0
# Retire the last product; no further multiply is started.
$LC0: lw $10,0($4)
mflo $3
mfhi $9
addu $3,$3,$2
sltu $2,$3,$2
addu $3,$10,$3
sltu $10,$3,$10
addu $2,$2,$10
sw $3,0($4)
j $31
addu $2,$9,$2 # add high product limb and carry from addition
.end __mpn_addmul_1

94
sysdeps/mips/lshift.s Normal file
View File

@ -0,0 +1,94 @@
# MIPS2 __mpn_lshift -- Shift a limb vector left by cnt bits and store the
# result in a second limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_lshift: shift the size-limb vector at src_ptr left by cnt bits and
# store the size-limb result at res_ptr.  The bits shifted out of the top
# limb are left in $2.  Limbs are 32-bit words (lw/sw, 4-byte stride).
#
# Both pointers are first moved to the end of their vectors and the limbs
# are then processed from the highest address downwards.  Each result limb
# is (current limb << cnt) | (next lower limb >> -cnt).  $13 holds -cnt;
# register-count shifts use only the low 5 bits of the count, so shifting
# by -cnt acts as shifting by 32-cnt.
# NOTE(review): this presumably requires 0 < cnt < 32 and size >= 1 --
# confirm against the callers.
#
# Register use: $10 = current source limb, $3 = next lower source limb,
# $8 = result limb, $9 = trip count of .Loop0 (reused as scratch in .Loop).
# INPUT PARAMETERS
# res_ptr $4
# src_ptr $5
# size $6
# cnt $7
.text
.align 2
.globl __mpn_lshift
.ent __mpn_lshift
__mpn_lshift:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
sll $2,$6,2 # size in bytes
addu $5,$5,$2 # make r5 point at end of src
lw $10,-4($5) # load first limb
subu $13,$0,$7 # $13 = -cnt, the complementary shift count
addu $4,$4,$2 # make r4 point at end of res
addiu $6,$6,-1 # the last limb is retired at .Lend, not in a loop
and $9,$6,4-1 # number of limbs in first loop
beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
srl $2,$10,$13 # compute function result
subu $6,$6,$9 # limbs left for the 4-way unrolled loop
# First loop: one limb per iteration.
.Loop0: lw $3,-8($5) # next lower source limb
addiu $4,$4,-4
addiu $5,$5,-4
addiu $9,$9,-1
sll $11,$10,$7 # current limb << cnt
srl $12,$3,$13 # bits carried up from the lower limb
move $10,$3
or $8,$11,$12
bne $9,$0,.Loop0
sw $8,0($4) # (delay slot) store result limb
.L0: beq $6,$0,.Lend
nop
# Main loop: four limbs per iteration; $10 and $3 alternate as the
# current/lower source limb.
.Loop: lw $3,-8($5)
addiu $4,$4,-16
addiu $6,$6,-4
sll $11,$10,$7
srl $12,$3,$13
lw $10,-12($5)
sll $14,$3,$7
or $8,$11,$12
sw $8,12($4)
srl $9,$10,$13
lw $3,-16($5)
sll $11,$10,$7
or $8,$14,$9
sw $8,8($4)
srl $12,$3,$13
lw $10,-20($5)
sll $14,$3,$7
or $8,$11,$12
sw $8,4($4)
srl $9,$10,$13
addiu $5,$5,-16
or $8,$14,$9
bgtz $6,.Loop
sw $8,0($4) # (delay slot) store fourth result limb
# The lowest limb has nothing below it: just shift and store.
.Lend: sll $8,$10,$7
j $31
sw $8,-4($4) # (delay slot) store the lowest result limb
.end __mpn_lshift

119
sysdeps/mips/mips3/add_n.s Normal file
View File

@ -0,0 +1,119 @@
# MIPS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and
# store sum in a third limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_add_n (64-bit limbs): add the size-limb vectors at s1_ptr and s2_ptr
# and store the size-limb sum at res_ptr.  The carry out of the last limb is
# left in $2.  Limbs are accessed with ld/sd at an 8-byte stride.  The first
# limbs are loaded unconditionally, so size must be at least 1.
#
# The code is software pipelined: the operands of the next limb are loaded
# while the current limb is being added.  The loops therefore retire size-1
# limbs and the last limb, already in registers, is finished at .Lend.
#
# Register use: $10/$11 and $12/$13 hold s1/s2 operand pairs, $2 = running
# carry, $8 = scratch carry, $9 = trip count of .Loop0.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# s2_ptr $6
# size $7
.text
.align 2
.globl __mpn_add_n
.ent __mpn_add_n
__mpn_add_n:
# noreorder: the instruction following each branch/jump is its delay slot
# and has been placed there by hand.
.set noreorder
.set nomacro
ld $10,0($5) # first s1 limb
ld $11,0($6) # first s2 limb
daddiu $7,$7,-1 # the last limb is retired at .Lend, not in a loop
and $9,$7,4-1 # number of limbs in first loop
beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
move $2,$0 # (delay slot) clear the carry
dsubu $7,$7,$9 # limbs left for the 4-way unrolled loop
# First loop: one limb per iteration.
.Loop0: daddiu $9,$9,-1
ld $12,8($5) # load next s1 limb
daddu $11,$11,$2 # s2 limb + carry in
ld $13,8($6) # load next s2 limb
sltu $8,$11,$2 # carry out of (s2 limb + carry in)
daddu $11,$10,$11 # + s1 limb
sltu $2,$11,$10 # carry out of that addition
sd $11,0($4)
or $2,$2,$8 # combine the two partial carries
daddiu $5,$5,8
daddiu $6,$6,8
move $10,$12 # next limbs become the current ones
move $11,$13
bne $9,$0,.Loop0
daddiu $4,$4,8 # (delay slot) advance res_ptr
.L0: beq $7,$0,.Lend
nop
# Main loop: four limbs per iteration, operands alternating between the
# $10/$11 and $12/$13 register pairs.
.Loop: daddiu $7,$7,-4
# limb 0: operands in $10/$11
ld $12,8($5)
daddu $11,$11,$2
ld $13,8($6)
sltu $8,$11,$2
daddu $11,$10,$11
sltu $2,$11,$10
sd $11,0($4)
or $2,$2,$8
# limb 1: operands in $12/$13
ld $10,16($5)
daddu $13,$13,$2
ld $11,16($6)
sltu $8,$13,$2
daddu $13,$12,$13
sltu $2,$13,$12
sd $13,8($4)
or $2,$2,$8
# limb 2: operands in $10/$11
ld $12,24($5)
daddu $11,$11,$2
ld $13,24($6)
sltu $8,$11,$2
daddu $11,$10,$11
sltu $2,$11,$10
sd $11,16($4)
or $2,$2,$8
# limb 3: operands in $12/$13; $10/$11 are reloaded for the next round
ld $10,32($5)
daddu $13,$13,$2
ld $11,32($6)
sltu $8,$13,$2
daddu $13,$12,$13
sltu $2,$13,$12
sd $13,24($4)
or $2,$2,$8
daddiu $5,$5,32
daddiu $6,$6,32
bne $7,$0,.Loop
daddiu $4,$4,32 # (delay slot) advance res_ptr
# Retire the last limb, whose operands are already in $10/$11.
.Lend: daddu $11,$11,$2
sltu $8,$11,$2
daddu $11,$10,$11
sltu $2,$11,$10
sd $11,0($4)
j $31
or $2,$2,$8 # (delay slot) final carry in $2
.end __mpn_add_n

View File

@ -0,0 +1,96 @@
# MIPS3 __mpn_addmul_1 -- Multiply a limb vector with a single limb and
# add the product to a second limb vector.
# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_addmul_1 (64-bit limbs): multiply the size-limb vector at s1_ptr by
# s2_limb and add the product into the size-limb vector at res_ptr
# (read-modify-write).  The carry limb out of the last position is left in
# $2.  Limbs are accessed with ld/sd at an 8-byte stride.  The first limb is
# loaded unconditionally, so size must be at least 1.
#
# The multiply is software pipelined: each stage reads HI/LO of the product
# started one stage earlier and starts the next multiply.  size == 1 enters
# at $LC0, size == 2 at $LC1, larger sizes run Loop size-2 times first.
#
# Register use: $8 = s1 limb being multiplied, $3/$9 = low/high product
# limb, $10 = res limb (then scratch carry), $2 = carry limb.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# size $6
# s2_limb $7
.text
.align 4
.globl __mpn_addmul_1
.ent __mpn_addmul_1
__mpn_addmul_1:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
# warm up phase 0
ld $8,0($5)
# warm up phase 1
daddiu $5,$5,8
dmultu $8,$7 # start first product
daddiu $6,$6,-1
beq $6,$0,$LC0 # size == 1: only the final stage is needed
move $2,$0 # zero cy2
daddiu $6,$6,-1
beq $6,$0,$LC1 # size == 2: skip the steady-state loop
ld $8,0($5) # load new s1 limb as early as possible
# Steady state: retire one product and start the next one.
Loop: ld $10,0($4)
mflo $3
mfhi $9
daddiu $5,$5,8
daddu $3,$3,$2 # add old carry limb to low product limb
dmultu $8,$7
ld $8,0($5) # load new s1 limb as early as possible
daddiu $6,$6,-1 # decrement loop counter
sltu $2,$3,$2 # carry from previous addition -> $2
daddu $3,$10,$3 # add the res limb
sltu $10,$3,$10 # carry from adding the res limb
daddu $2,$2,$10
sd $3,0($4)
daddiu $4,$4,8
bne $6,$0,Loop # should be "bnel"
daddu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 1
# Retire the second-to-last product and start the last multiply.
$LC1: ld $10,0($4)
mflo $3
mfhi $9
daddu $3,$3,$2
sltu $2,$3,$2
dmultu $8,$7
daddu $3,$10,$3
sltu $10,$3,$10
daddu $2,$2,$10
sd $3,0($4)
daddiu $4,$4,8
daddu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 0
# Retire the last product; no further multiply is started.
$LC0: ld $10,0($4)
mflo $3
mfhi $9
daddu $3,$3,$2
sltu $2,$3,$2
daddu $3,$10,$3
sltu $10,$3,$10
daddu $2,$2,$10
sd $3,0($4)
j $31
daddu $2,$9,$2 # add high product limb and carry from addition
.end __mpn_addmul_1

View File

@ -0,0 +1,26 @@
/* gmp-mparam.h -- Compiler/machine parameter header file.
Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
/* Widths of the limb type and of the basic C integer types for this
   configuration.  All values are in bits except BYTES_PER_MP_LIMB.  */
#define BITS_PER_MP_LIMB 64 /* bits in one limb */
#define BYTES_PER_MP_LIMB 8 /* bytes in one limb (64 / 8) */
/* NOTE(review): a limb (64 bits) is wider than a long (32 bits) here, so
   the limb type is presumably a 64-bit `long long' -- confirm.  */
#define BITS_PER_LONGINT 32
#define BITS_PER_INT 32
#define BITS_PER_SHORTINT 16
#define BITS_PER_CHAR 8

View File

@ -0,0 +1,94 @@
# MIPS3 __mpn_lshift -- Shift a limb vector left by cnt bits and store the
# result in a second limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_lshift (64-bit limbs): shift the size-limb vector at src_ptr left by
# cnt bits and store the size-limb result at res_ptr.  The bits shifted out
# of the top limb are left in $2.  Limbs use ld/sd at an 8-byte stride.
#
# Both pointers are first moved to the end of their vectors and the limbs
# are then processed from the highest address downwards.  Each result limb
# is (current limb << cnt) | (next lower limb >> -cnt).  $13 holds -cnt;
# 64-bit register-count shifts use only the low 6 bits of the count, so
# shifting by -cnt acts as shifting by 64-cnt.
# NOTE(review): this presumably requires 0 < cnt < 64 and size >= 1 --
# confirm against the callers.
#
# Register use: $10 = current source limb, $3 = next lower source limb,
# $8 = result limb, $9 = trip count of .Loop0 (reused as scratch in .Loop).
# INPUT PARAMETERS
# res_ptr $4
# src_ptr $5
# size $6
# cnt $7
.text
.align 2
.globl __mpn_lshift
.ent __mpn_lshift
__mpn_lshift:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
dsll $2,$6,3 # size in bytes
daddu $5,$5,$2 # make r5 point at end of src
ld $10,-8($5) # load first limb
dsubu $13,$0,$7 # $13 = -cnt, the complementary shift count
daddu $4,$4,$2 # make r4 point at end of res
daddiu $6,$6,-1 # the last limb is retired at .Lend, not in a loop
and $9,$6,4-1 # number of limbs in first loop
beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
dsrl $2,$10,$13 # compute function result
dsubu $6,$6,$9 # limbs left for the 4-way unrolled loop
# First loop: one limb per iteration.
.Loop0: ld $3,-16($5) # next lower source limb
daddiu $4,$4,-8
daddiu $5,$5,-8
daddiu $9,$9,-1
dsll $11,$10,$7 # current limb << cnt
dsrl $12,$3,$13 # bits carried up from the lower limb
move $10,$3
or $8,$11,$12
bne $9,$0,.Loop0
sd $8,0($4) # (delay slot) store result limb
.L0: beq $6,$0,.Lend
nop
# Main loop: four limbs per iteration; $10 and $3 alternate as the
# current/lower source limb.
.Loop: ld $3,-16($5)
daddiu $4,$4,-32
daddiu $6,$6,-4
dsll $11,$10,$7
dsrl $12,$3,$13
ld $10,-24($5)
dsll $14,$3,$7
or $8,$11,$12
sd $8,24($4)
dsrl $9,$10,$13
ld $3,-32($5)
dsll $11,$10,$7
or $8,$14,$9
sd $8,16($4)
dsrl $12,$3,$13
ld $10,-40($5)
dsll $14,$3,$7
or $8,$11,$12
sd $8,8($4)
dsrl $9,$10,$13
daddiu $5,$5,-32
or $8,$14,$9
bgtz $6,.Loop
sd $8,0($4) # (delay slot) store fourth result limb
# The lowest limb has nothing below it: just shift and store.
.Lend: dsll $8,$10,$7
j $31
sd $8,-8($4) # (delay slot) store the lowest result limb
.end __mpn_lshift

View File

@ -0,0 +1,84 @@
# MIPS3 __mpn_mul_1 -- Multiply a limb vector with a single limb and
# store the product in a second limb vector.
# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_mul_1 (64-bit limbs): multiply the size-limb vector at s1_ptr by
# s2_limb and store the low size limbs of the product at res_ptr.  The carry
# limb out of the last position is left in $2.  Limbs use ld/sd at an 8-byte
# stride.  The first limb is loaded unconditionally, so size must be >= 1.
#
# The multiply is software pipelined: each stage reads HI/LO of the product
# started one stage earlier and starts the next multiply.  size == 1 enters
# at $LC0, size == 2 at $LC1, larger sizes run Loop size-2 times first.
#
# Register use: $8 = s1 limb being multiplied, $10/$9 = low/high product
# limb, $2 = carry limb.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# size $6
# s2_limb $7
.text
.align 4
.globl __mpn_mul_1
.ent __mpn_mul_1
__mpn_mul_1:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
# warm up phase 0
ld $8,0($5)
# warm up phase 1
daddiu $5,$5,8
dmultu $8,$7 # start first product
daddiu $6,$6,-1
beq $6,$0,$LC0 # size == 1: only the final stage is needed
move $2,$0 # zero cy2
daddiu $6,$6,-1
beq $6,$0,$LC1 # size == 2: skip the steady-state loop
ld $8,0($5) # load new s1 limb as early as possible
# Steady state: retire one product and start the next one.
Loop: mflo $10
mfhi $9
daddiu $5,$5,8
daddu $10,$10,$2 # add old carry limb to low product limb
dmultu $8,$7
ld $8,0($5) # load new s1 limb as early as possible
daddiu $6,$6,-1 # decrement loop counter
sltu $2,$10,$2 # carry from previous addition -> $2
sd $10,0($4)
daddiu $4,$4,8
bne $6,$0,Loop # should be "bnel"
daddu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 1
# Retire the second-to-last product and start the last multiply.
$LC1: mflo $10
mfhi $9
daddu $10,$10,$2
sltu $2,$10,$2
dmultu $8,$7
sd $10,0($4)
daddiu $4,$4,8
daddu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 0
# Retire the last product; no further multiply is started.
$LC0: mflo $10
mfhi $9
daddu $10,$10,$2
sltu $2,$10,$2
sd $10,0($4)
j $31
daddu $2,$9,$2 # add high product limb and carry from addition
.end __mpn_mul_1

View File

@ -0,0 +1,91 @@
# MIPS3 __mpn_rshift -- Shift a limb vector right by cnt bits and store the
# result in a second limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_rshift (64-bit limbs): shift the size-limb vector at src_ptr right
# by cnt bits and store the size-limb result at res_ptr.  The bits shifted
# out of the first limb are left in $2, moved up by the complementary shift
# (first limb << -cnt).  Limbs use ld/sd at an 8-byte stride.
#
# Limbs are processed from the lowest address upwards.  Each result limb is
# (current limb >> cnt) | (next higher limb << -cnt).  $13 holds -cnt;
# 64-bit register-count shifts use only the low 6 bits of the count, so
# shifting by -cnt acts as shifting by 64-cnt.
# NOTE(review): this presumably requires 0 < cnt < 64 and size >= 1 --
# confirm against the callers.
#
# Register use: $10 = current source limb, $3 = next higher source limb,
# $8 = result limb, $9 = trip count of .Loop0 (reused as scratch in .Loop).
# INPUT PARAMETERS
# res_ptr $4
# src_ptr $5
# size $6
# cnt $7
.text
.align 2
.globl __mpn_rshift
.ent __mpn_rshift
__mpn_rshift:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
ld $10,0($5) # load first limb
dsubu $13,$0,$7 # $13 = -cnt, the complementary shift count
daddiu $6,$6,-1 # the last limb is retired at .Lend, not in a loop
and $9,$6,4-1 # number of limbs in first loop
beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
dsll $2,$10,$13 # compute function result
dsubu $6,$6,$9 # limbs left for the 4-way unrolled loop
# First loop: one limb per iteration.
.Loop0: ld $3,8($5) # next higher source limb
daddiu $4,$4,8
daddiu $5,$5,8
daddiu $9,$9,-1
dsrl $11,$10,$7 # current limb >> cnt
dsll $12,$3,$13 # bits carried down from the higher limb
move $10,$3
or $8,$11,$12
bne $9,$0,.Loop0
sd $8,-8($4) # (delay slot) store result limb
.L0: beq $6,$0,.Lend
nop
# Main loop: four limbs per iteration; $10 and $3 alternate as the
# current/higher source limb.
.Loop: ld $3,8($5)
daddiu $4,$4,32
daddiu $6,$6,-4
dsrl $11,$10,$7
dsll $12,$3,$13
ld $10,16($5)
dsrl $14,$3,$7
or $8,$11,$12
sd $8,-32($4)
dsll $9,$10,$13
ld $3,24($5)
dsrl $11,$10,$7
or $8,$14,$9
sd $8,-24($4)
dsll $12,$3,$13
ld $10,32($5)
dsrl $14,$3,$7
or $8,$11,$12
sd $8,-16($4)
dsll $9,$10,$13
daddiu $5,$5,32
or $8,$14,$9
bgtz $6,.Loop
sd $8,-8($4) # (delay slot) store fourth result limb
# The highest limb has nothing above it: just shift and store.
.Lend: dsrl $8,$10,$7
j $31
sd $8,0($4) # (delay slot) store the highest result limb
.end __mpn_rshift

119
sysdeps/mips/mips3/sub_n.s Normal file
View File

@ -0,0 +1,119 @@
# MIPS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
# store difference in a third limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_sub_n (64-bit limbs): subtract the size-limb vector at s2_ptr from
# the one at s1_ptr and store the size-limb difference at res_ptr.  The
# borrow out of the last limb is left in $2.  Limbs use ld/sd at an 8-byte
# stride.  The first limbs are loaded unconditionally, so size must be >= 1.
#
# The code is software pipelined: the operands of the next limb are loaded
# while the current limb is being subtracted.  The loops therefore retire
# size-1 limbs and the last limb, already in registers, is finished at .Lend.
#
# Register use: $10/$11 and $12/$13 hold s1/s2 operand pairs, $2 = running
# borrow, $8 = scratch borrow, $9 = trip count of .Loop0.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# s2_ptr $6
# size $7
.text
.align 2
.globl __mpn_sub_n
.ent __mpn_sub_n
__mpn_sub_n:
# noreorder: the instruction following each branch/jump is its delay slot
# and has been placed there by hand.
.set noreorder
.set nomacro
ld $10,0($5) # first s1 limb
ld $11,0($6) # first s2 limb
daddiu $7,$7,-1 # the last limb is retired at .Lend, not in a loop
and $9,$7,4-1 # number of limbs in first loop
beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
move $2,$0 # (delay slot) clear the borrow
dsubu $7,$7,$9 # limbs left for the 4-way unrolled loop
# First loop: one limb per iteration.
.Loop0: daddiu $9,$9,-1
ld $12,8($5) # load next s1 limb
daddu $11,$11,$2 # s2 limb + borrow in
ld $13,8($6) # load next s2 limb
sltu $8,$11,$2 # overflow of (s2 limb + borrow in)
dsubu $11,$10,$11 # s1 limb - (s2 limb + borrow in)
sltu $2,$10,$11 # borrow: the difference exceeds the s1 limb
sd $11,0($4)
or $2,$2,$8 # combine the two partial borrows
daddiu $5,$5,8
daddiu $6,$6,8
move $10,$12 # next limbs become the current ones
move $11,$13
bne $9,$0,.Loop0
daddiu $4,$4,8 # (delay slot) advance res_ptr
.L0: beq $7,$0,.Lend
nop
# Main loop: four limbs per iteration, operands alternating between the
# $10/$11 and $12/$13 register pairs.
.Loop: daddiu $7,$7,-4
# limb 0: operands in $10/$11
ld $12,8($5)
daddu $11,$11,$2
ld $13,8($6)
sltu $8,$11,$2
dsubu $11,$10,$11
sltu $2,$10,$11
sd $11,0($4)
or $2,$2,$8
# limb 1: operands in $12/$13
ld $10,16($5)
daddu $13,$13,$2
ld $11,16($6)
sltu $8,$13,$2
dsubu $13,$12,$13
sltu $2,$12,$13
sd $13,8($4)
or $2,$2,$8
# limb 2: operands in $10/$11
ld $12,24($5)
daddu $11,$11,$2
ld $13,24($6)
sltu $8,$11,$2
dsubu $11,$10,$11
sltu $2,$10,$11
sd $11,16($4)
or $2,$2,$8
# limb 3: operands in $12/$13; $10/$11 are reloaded for the next round
ld $10,32($5)
daddu $13,$13,$2
ld $11,32($6)
sltu $8,$13,$2
dsubu $13,$12,$13
sltu $2,$12,$13
sd $13,24($4)
or $2,$2,$8
daddiu $5,$5,32
daddiu $6,$6,32
bne $7,$0,.Loop
daddiu $4,$4,32 # (delay slot) advance res_ptr
# Retire the last limb, whose operands are already in $10/$11.
.Lend: daddu $11,$11,$2
sltu $8,$11,$2
dsubu $11,$10,$11
sltu $2,$10,$11
sd $11,0($4)
j $31
or $2,$2,$8 # (delay slot) final borrow in $2
.end __mpn_sub_n

View File

@ -0,0 +1,96 @@
# MIPS3 __mpn_submul_1 -- Multiply a limb vector with a single limb and
# subtract the product from a second limb vector.
# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_submul_1 (64-bit limbs): multiply the size-limb vector at s1_ptr by
# s2_limb and subtract the product from the size-limb vector at res_ptr
# (read-modify-write).  The limb carried out of the last position (high
# product limb plus carry and borrow) is left in $2.  Limbs use ld/sd at an
# 8-byte stride.  The first limb is loaded unconditionally, so size >= 1.
#
# The multiply is software pipelined: each stage reads HI/LO of the product
# started one stage earlier and starts the next multiply.  size == 1 enters
# at $LC0, size == 2 at $LC1, larger sizes run Loop size-2 times first.
#
# Register use: $8 = s1 limb being multiplied, $3/$9 = low/high product
# limb, $10 = res limb (then scratch borrow), $2 = carry limb.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# size $6
# s2_limb $7
.text
.align 4
.globl __mpn_submul_1
.ent __mpn_submul_1
__mpn_submul_1:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
# warm up phase 0
ld $8,0($5)
# warm up phase 1
daddiu $5,$5,8
dmultu $8,$7 # start first product
daddiu $6,$6,-1
beq $6,$0,$LC0 # size == 1: only the final stage is needed
move $2,$0 # zero cy2
daddiu $6,$6,-1
beq $6,$0,$LC1 # size == 2: skip the steady-state loop
ld $8,0($5) # load new s1 limb as early as possible
# Steady state: retire one product and start the next one.
Loop: ld $10,0($4)
mflo $3
mfhi $9
daddiu $5,$5,8
daddu $3,$3,$2 # add old carry limb to low product limb
dmultu $8,$7
ld $8,0($5) # load new s1 limb as early as possible
daddiu $6,$6,-1 # decrement loop counter
sltu $2,$3,$2 # carry from previous addition -> $2
dsubu $3,$10,$3 # res limb - (low product limb + carry)
sgtu $10,$3,$10 # borrow: the difference exceeds the res limb
daddu $2,$2,$10
sd $3,0($4)
daddiu $4,$4,8
bne $6,$0,Loop # should be "bnel"
daddu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 1
# Retire the second-to-last product and start the last multiply.
$LC1: ld $10,0($4)
mflo $3
mfhi $9
daddu $3,$3,$2
sltu $2,$3,$2
dmultu $8,$7
dsubu $3,$10,$3
sgtu $10,$3,$10
daddu $2,$2,$10
sd $3,0($4)
daddiu $4,$4,8
daddu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 0
# Retire the last product; no further multiply is started.
$LC0: ld $10,0($4)
mflo $3
mfhi $9
daddu $3,$3,$2
sltu $2,$3,$2
dsubu $3,$10,$3
sgtu $10,$3,$10
daddu $2,$2,$10
sd $3,0($4)
j $31
daddu $2,$9,$2 # add high product limb and carry from addition
.end __mpn_submul_1

84
sysdeps/mips/mul_1.s Normal file
View File

@ -0,0 +1,84 @@
# MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and
# store the product in a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_mul_1: multiply the size-limb vector at s1_ptr by s2_limb and store
# the low size limbs of the product at res_ptr.  The carry limb out of the
# last position is left in $2.  Limbs are 32-bit words (lw/sw, 4-byte
# stride).  The first limb is loaded unconditionally, so size must be >= 1.
#
# The multiply is software pipelined: each stage reads HI/LO of the product
# started one stage earlier and starts the next multiply.  size == 1 enters
# at $LC0, size == 2 at $LC1, larger sizes run Loop size-2 times first.
#
# Register use: $8 = s1 limb being multiplied, $10/$9 = low/high product
# limb, $2 = carry limb.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# size $6
# s2_limb $7
.text
.align 4
.globl __mpn_mul_1
.ent __mpn_mul_1
__mpn_mul_1:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
# warm up phase 0
lw $8,0($5)
# warm up phase 1
addiu $5,$5,4
multu $8,$7 # start first product
addiu $6,$6,-1
beq $6,$0,$LC0 # size == 1: only the final stage is needed
move $2,$0 # zero cy2
addiu $6,$6,-1
beq $6,$0,$LC1 # size == 2: skip the steady-state loop
lw $8,0($5) # load new s1 limb as early as possible
# Steady state: retire one product and start the next one.
Loop: mflo $10
mfhi $9
addiu $5,$5,4
addu $10,$10,$2 # add old carry limb to low product limb
multu $8,$7
lw $8,0($5) # load new s1 limb as early as possible
addiu $6,$6,-1 # decrement loop counter
sltu $2,$10,$2 # carry from previous addition -> $2
sw $10,0($4)
addiu $4,$4,4
bne $6,$0,Loop # should be "bnel"
addu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 1
# Retire the second-to-last product and start the last multiply.
$LC1: mflo $10
mfhi $9
addu $10,$10,$2
sltu $2,$10,$2
multu $8,$7
sw $10,0($4)
addiu $4,$4,4
addu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 0
# Retire the last product; no further multiply is started.
$LC0: mflo $10
mfhi $9
addu $10,$10,$2
sltu $2,$10,$2
sw $10,0($4)
j $31
addu $2,$9,$2 # add high product limb and carry from addition
.end __mpn_mul_1

91
sysdeps/mips/rshift.s Normal file
View File

@ -0,0 +1,91 @@
# MIPS2 __mpn_rshift -- Shift a limb vector right by cnt bits and store the
# result in a second limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_rshift: shift the size-limb vector at src_ptr right by cnt bits and
# store the size-limb result at res_ptr.  The bits shifted out of the first
# limb are left in $2, moved up by the complementary shift (first limb <<
# -cnt).  Limbs are 32-bit words (lw/sw, 4-byte stride).
#
# Limbs are processed from the lowest address upwards.  Each result limb is
# (current limb >> cnt) | (next higher limb << -cnt).  $13 holds -cnt;
# register-count shifts use only the low 5 bits of the count, so shifting
# by -cnt acts as shifting by 32-cnt.
# NOTE(review): this presumably requires 0 < cnt < 32 and size >= 1 --
# confirm against the callers.
#
# Register use: $10 = current source limb, $3 = next higher source limb,
# $8 = result limb, $9 = trip count of .Loop0 (reused as scratch in .Loop).
# INPUT PARAMETERS
# res_ptr $4
# src_ptr $5
# size $6
# cnt $7
.text
.align 2
.globl __mpn_rshift
.ent __mpn_rshift
__mpn_rshift:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
lw $10,0($5) # load first limb
subu $13,$0,$7 # $13 = -cnt, the complementary shift count
addiu $6,$6,-1 # the last limb is retired at .Lend, not in a loop
and $9,$6,4-1 # number of limbs in first loop
beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
sll $2,$10,$13 # compute function result
subu $6,$6,$9 # limbs left for the 4-way unrolled loop
# First loop: one limb per iteration.
.Loop0: lw $3,4($5) # next higher source limb
addiu $4,$4,4
addiu $5,$5,4
addiu $9,$9,-1
srl $11,$10,$7 # current limb >> cnt
sll $12,$3,$13 # bits carried down from the higher limb
move $10,$3
or $8,$11,$12
bne $9,$0,.Loop0
sw $8,-4($4) # (delay slot) store result limb
.L0: beq $6,$0,.Lend
nop
# Main loop: four limbs per iteration; $10 and $3 alternate as the
# current/higher source limb.
.Loop: lw $3,4($5)
addiu $4,$4,16
addiu $6,$6,-4
srl $11,$10,$7
sll $12,$3,$13
lw $10,8($5)
srl $14,$3,$7
or $8,$11,$12
sw $8,-16($4)
sll $9,$10,$13
lw $3,12($5)
srl $11,$10,$7
or $8,$14,$9
sw $8,-12($4)
sll $12,$3,$13
lw $10,16($5)
srl $14,$3,$7
or $8,$11,$12
sw $8,-8($4)
sll $9,$10,$13
addiu $5,$5,16
or $8,$14,$9
bgtz $6,.Loop
sw $8,-4($4) # (delay slot) store fourth result limb
# The highest limb has nothing above it: just shift and store.
.Lend: srl $8,$10,$7
j $31
sw $8,0($4) # (delay slot) store the highest result limb
.end __mpn_rshift

119
sysdeps/mips/sub_n.s Normal file
View File

@ -0,0 +1,119 @@
# MIPS2 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
# store difference in a third limb vector.
# Copyright (C) 1995 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_sub_n: subtract the size-limb vector at s2_ptr from the one at
# s1_ptr and store the size-limb difference at res_ptr.  The borrow out of
# the last limb is left in $2.  Limbs are 32-bit words (lw/sw, 4-byte
# stride).  The first limbs are loaded unconditionally, so size must be >= 1.
#
# The code is software pipelined: the operands of the next limb are loaded
# while the current limb is being subtracted.  The loops therefore retire
# size-1 limbs and the last limb, already in registers, is finished at .Lend.
#
# Register use: $10/$11 and $12/$13 hold s1/s2 operand pairs, $2 = running
# borrow, $8 = scratch borrow, $9 = trip count of .Loop0.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# s2_ptr $6
# size $7
.text
.align 2
.globl __mpn_sub_n
.ent __mpn_sub_n
__mpn_sub_n:
# noreorder: the instruction following each branch/jump is its delay slot
# and has been placed there by hand.
.set noreorder
.set nomacro
lw $10,0($5) # first s1 limb
lw $11,0($6) # first s2 limb
addiu $7,$7,-1 # the last limb is retired at .Lend, not in a loop
and $9,$7,4-1 # number of limbs in first loop
beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
move $2,$0 # (delay slot) clear the borrow
subu $7,$7,$9 # limbs left for the 4-way unrolled loop
# First loop: one limb per iteration.
.Loop0: addiu $9,$9,-1
lw $12,4($5) # load next s1 limb
addu $11,$11,$2 # s2 limb + borrow in
lw $13,4($6) # load next s2 limb
sltu $8,$11,$2 # overflow of (s2 limb + borrow in)
subu $11,$10,$11 # s1 limb - (s2 limb + borrow in)
sltu $2,$10,$11 # borrow: the difference exceeds the s1 limb
sw $11,0($4)
or $2,$2,$8 # combine the two partial borrows
addiu $5,$5,4
addiu $6,$6,4
move $10,$12 # next limbs become the current ones
move $11,$13
bne $9,$0,.Loop0
addiu $4,$4,4 # (delay slot) advance res_ptr
.L0: beq $7,$0,.Lend
nop
# Main loop: four limbs per iteration, operands alternating between the
# $10/$11 and $12/$13 register pairs.
.Loop: addiu $7,$7,-4
# limb 0: operands in $10/$11
lw $12,4($5)
addu $11,$11,$2
lw $13,4($6)
sltu $8,$11,$2
subu $11,$10,$11
sltu $2,$10,$11
sw $11,0($4)
or $2,$2,$8
# limb 1: operands in $12/$13
lw $10,8($5)
addu $13,$13,$2
lw $11,8($6)
sltu $8,$13,$2
subu $13,$12,$13
sltu $2,$12,$13
sw $13,4($4)
or $2,$2,$8
# limb 2: operands in $10/$11
lw $12,12($5)
addu $11,$11,$2
lw $13,12($6)
sltu $8,$11,$2
subu $11,$10,$11
sltu $2,$10,$11
sw $11,8($4)
or $2,$2,$8
# limb 3: operands in $12/$13; $10/$11 are reloaded for the next round
lw $10,16($5)
addu $13,$13,$2
lw $11,16($6)
sltu $8,$13,$2
subu $13,$12,$13
sltu $2,$12,$13
sw $13,12($4)
or $2,$2,$8
addiu $5,$5,16
addiu $6,$6,16
bne $7,$0,.Loop
addiu $4,$4,16 # (delay slot) advance res_ptr
# Retire the last limb, whose operands are already in $10/$11.
.Lend: addu $11,$11,$2
sltu $8,$11,$2
subu $11,$10,$11
sltu $2,$10,$11
sw $11,0($4)
j $31
or $2,$2,$8 # (delay slot) final borrow in $2
.end __mpn_sub_n

96
sysdeps/mips/submul_1.s Normal file
View File

@ -0,0 +1,96 @@
# MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and
# subtract the product from a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_submul_1: multiply the size-limb vector at s1_ptr by s2_limb and
# subtract the product from the size-limb vector at res_ptr
# (read-modify-write).  The limb carried out of the last position (high
# product limb plus carry and borrow) is left in $2.  Limbs are 32-bit words
# (lw/sw, 4-byte stride).  The first limb is loaded unconditionally, so
# size must be at least 1.
#
# The multiply is software pipelined: each stage reads HI/LO of the product
# started one stage earlier and starts the next multiply.  size == 1 enters
# at $LC0, size == 2 at $LC1, larger sizes run Loop size-2 times first.
#
# Register use: $8 = s1 limb being multiplied, $3/$9 = low/high product
# limb, $10 = res limb (then scratch borrow), $2 = carry limb.
# INPUT PARAMETERS
# res_ptr $4
# s1_ptr $5
# size $6
# s2_limb $7
.text
.align 4
.globl __mpn_submul_1
.ent __mpn_submul_1
__mpn_submul_1:
# noreorder: the instruction following each branch/jump is its delay slot.
.set noreorder
.set nomacro
# warm up phase 0
lw $8,0($5)
# warm up phase 1
addiu $5,$5,4
multu $8,$7 # start first product
addiu $6,$6,-1
beq $6,$0,$LC0 # size == 1: only the final stage is needed
move $2,$0 # zero cy2
addiu $6,$6,-1
beq $6,$0,$LC1 # size == 2: skip the steady-state loop
lw $8,0($5) # load new s1 limb as early as possible
# Steady state: retire one product and start the next one.
Loop: lw $10,0($4)
mflo $3
mfhi $9
addiu $5,$5,4
addu $3,$3,$2 # add old carry limb to low product limb
multu $8,$7
lw $8,0($5) # load new s1 limb as early as possible
addiu $6,$6,-1 # decrement loop counter
sltu $2,$3,$2 # carry from previous addition -> $2
subu $3,$10,$3 # res limb - (low product limb + carry)
sgtu $10,$3,$10 # borrow: the difference exceeds the res limb
addu $2,$2,$10
sw $3,0($4)
addiu $4,$4,4
bne $6,$0,Loop # should be "bnel"
addu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 1
# Retire the second-to-last product and start the last multiply.
$LC1: lw $10,0($4)
mflo $3
mfhi $9
addu $3,$3,$2
sltu $2,$3,$2
multu $8,$7
subu $3,$10,$3
sgtu $10,$3,$10
addu $2,$2,$10
sw $3,0($4)
addiu $4,$4,4
addu $2,$9,$2 # add high product limb and carry from addition
# cool down phase 0
# Retire the last product; no further multiply is started.
$LC0: lw $10,0($4)
mflo $3
mfhi $9
addu $3,$3,$2
sltu $2,$3,$2
subu $3,$10,$3
sgtu $10,$3,$10
addu $2,$2,$10
sw $3,0($4)
j $31
addu $2,$9,$2 # add high product limb and carry from addition
.end __mpn_submul_1

54
sysdeps/rs6000/add_n.s Normal file
View File

@ -0,0 +1,54 @@
# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# __mpn_add_n: add the size-limb vectors at s1_ptr and s2_ptr and store the
# size-limb sum at res_ptr; the final carry is returned in r3.  Limbs are
# accessed at a 4-byte stride.  The first limbs are loaded and added before
# the count is tested, so size must be at least 1.
#
# The loop is skewed by one limb: each iteration stores the sum computed in
# the previous iteration while the next operands are being loaded, and the
# last sum is stored at Lend.
# INPUT PARAMETERS
# res_ptr r3
# s1_ptr r4
# s2_ptr r5
# size r6
.toc
.extern __mpn_add_n[DS]
.extern .__mpn_add_n
.csect [PR]
.align 2
.globl __mpn_add_n
.globl .__mpn_add_n
# Descriptor for the function: code entry address, TOC anchor, and a zero.
.csect __mpn_add_n[DS]
__mpn_add_n:
.long .__mpn_add_n, TOC[tc0], 0
.csect [PR]
# Code entry point.
.__mpn_add_n:
mtctr 6 # copy size into CTR
l 8,0(4) # load least significant s1 limb
l 0,0(5) # load least significant s2 limb
cal 3,-4(3) # offset res_ptr, it's updated before used
a 7,0,8 # add least significant limbs, set cy
bdz Lend # If done, skip loop
Loop: lu 8,4(4) # load s1 limb and update s1_ptr
lu 0,4(5) # load s2 limb and update s2_ptr
stu 7,4(3) # store previous limb in load latency slot
ae 7,0,8 # add new limbs with cy, set cy
bdn Loop # decrement CTR and loop back
Lend: st 7,4(3) # store ultimate result limb
lil 3,0 # load cy into ...
aze 3,3 # ... return value register
br

122
sysdeps/rs6000/addmul_1.s Normal file
View File

@ -0,0 +1,122 @@
# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add
# the result to a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r3
# s1_ptr r4
# size r5
# s2_limb r6
# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
# obtain that operation, we have to use the 32x32->64 signed multiplication
# instruction, and add the appropriate compensation to the high limb of the
# result. We add the multiplicand if the multiplier has its most significant
# bit set, and we add the multiplier if the multiplicand has its most
# significant bit set. We need to preserve the carry flag between each
# iteration, so we have to compute the compensation carefully (the natural,
# srai+and doesn't work). Since the POWER architecture has a branch unit
# we can branch in zero cycles, so that's how we perform the additions.
.toc
.csect .__mpn_addmul_1[PR]
.align 2
.globl __mpn_addmul_1
.globl .__mpn_addmul_1
.csect __mpn_addmul_1[DS]
__mpn_addmul_1:
.long .__mpn_addmul_1[PR], TOC[tc0], 0
.csect .__mpn_addmul_1[PR]
.__mpn_addmul_1:
# Multiply the size-limb vector at s1_ptr by s2_limb, add the products to
# the vector at res_ptr in place and return the final carry limb in r3.
# The first limb is processed here. The sign of s2_limb, tested once by the
# cmpi below, then selects the Lpos or the Lneg loop. Both loops are
# unrolled twice and alternate between r9 and r10 as the carry limb.
cal 3,-4(3) # offset res_ptr, it's updated before used
l 0,0(4) # load least significant s1 limb
cmpi 0,6,0 # test sign of s2_limb for the blt below
mtctr 5 # copy size into CTR
mul 9,0,6 # signed multiply: high limb to r9, low limb to MQ
srai 7,0,31 # r7 = sign mask of the s1 limb
and 7,7,6 # r7 = s2_limb if the s1 limb is negative, else 0
mfmq 8 # fetch low product limb
cax 9,9,7 # adjust high limb for negative limb from s1
l 7,4(3) # load first res_limb
a 8,8,7 # add res_limb
blt Lneg # s2_limb negative: use the loop that also adds the s1 limbs
Lpos: bdz Lend # if just one limb, skip loop
Lploop: lu 0,4(4) # load s1 limb and update s1_ptr
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 10,0,6 # high limb to r10, low limb to MQ
mfmq 0 # fetch low product limb (the s1 limb is no longer needed)
ae 8,0,9 # low limb + old_cy_limb + old cy
l 7,4(3) # load res_limb
aze 10,10 # propagate cy to new cy_limb
a 8,8,7 # add res_limb
bge Lp0 # s1 limb not negative, no adjustment needed
cax 10,10,6 # adjust high limb for negative limb from s1
Lp0: bdz Lend0 # done, with the carry limb in r10
lu 0,4(4) # second copy of the loop body, r9 and r10 swapped
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 9,0,6 # high limb to r9, low limb to MQ
mfmq 0 # fetch low product limb
ae 8,0,10 # low limb + old_cy_limb + old cy
l 7,4(3) # load res_limb
aze 9,9 # propagate cy to new cy_limb
a 8,8,7 # add res_limb
bge Lp1 # s1 limb not negative, no adjustment needed
cax 9,9,6 # adjust high limb for negative limb from s1
Lp1: bdn Lploop # decrement CTR and loop back
b Lend # done, with the carry limb in r9
Lneg: cax 9,9,0 # adjust high limb for negative s2_limb
bdz Lend # if just one limb, skip loop
Lnloop: lu 0,4(4) # load s1 limb and update s1_ptr
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 10,0,6 # high limb to r10, low limb to MQ
mfmq 7 # fetch low product limb into r7, r0 is still needed
ae 8,7,9 # low limb + old_cy_limb + old cy
l 7,4(3) # load res_limb
ae 10,10,0 # propagate cy to new cy_limb, adding the s1 limb for negative s2_limb
a 8,8,7 # add res_limb
bge Ln0 # s1 limb not negative, no adjustment needed
cax 10,10,6 # adjust high limb for negative limb from s1
Ln0: bdz Lend0 # done, with the carry limb in r10
lu 0,4(4) # second copy of the loop body, r9 and r10 swapped
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 9,0,6 # high limb to r9, low limb to MQ
mfmq 7 # fetch low product limb into r7, r0 is still needed
ae 8,7,10 # low limb + old_cy_limb + old cy
l 7,4(3) # load res_limb
ae 9,9,0 # propagate cy to new cy_limb
a 8,8,7 # add res_limb
bge Ln1 # s1 limb not negative, no adjustment needed
cax 9,9,6 # adjust high limb for negative limb from s1
Ln1: bdn Lnloop # decrement CTR and loop back
b Lend # done, with the carry limb in r9
Lend0: cal 9,0(10) # move the carry limb from r10 to r9
Lend: st 8,4(3) # store ultimate result limb
aze 3,9 # return carry limb plus the pending cy
br # return to caller

58
sysdeps/rs6000/lshift.s Normal file
View File

@ -0,0 +1,58 @@
# IBM POWER __mpn_lshift -- Shift a limb vector left by cnt bits and store
# the result in a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r3
# s_ptr r4
# size r5
# cnt r6
.toc
.extern __mpn_lshift[DS]
.extern .__mpn_lshift
.csect [PR]
.align 2
.globl __mpn_lshift
.globl .__mpn_lshift
.csect __mpn_lshift[DS]
__mpn_lshift:
.long .__mpn_lshift, TOC[tc0], 0
.csect [PR]
.__mpn_lshift:
# Shift the size-limb vector at s_ptr left by cnt bits and store the result
# at res_ptr, working from the most significant limb downwards. The bits
# shifted out of the most significant limb are returned in r3.
# NOTE(review): the 32-cnt shift count below suggests cnt is expected to be
# in the range 1..31 -- confirm against the callers.
sli 0,5,2 # r0 = size * 4, the byte length of the vectors
cax 9,3,0 # r9 = res_ptr + byte length, one past the last result limb
cax 4,4,0 # advance s_ptr one past the last source limb
sfi 8,6,32 # r8 = 32 - cnt
mtctr 5 # put limb count in CTR loop register
lu 0,-4(4) # read most significant limb
sre 3,0,8 # compute carry out limb, and init MQ register
bdz Lend2 # if just one limb, skip loop
lu 0,-4(4) # read 2:nd most significant limb
sreq 7,0,8 # compute most significant limb of result
bdz Lend # if just two limb, skip loop
Loop: lu 0,-4(4) # load next lower limb
stu 7,-4(9) # store previous result during read latency
sreq 7,0,8 # compute result limb
bdn Loop # loop back until CTR is zero
Lend: stu 7,-4(9) # store 2:nd least significant limb
Lend2: sle 7,0,6 # compute least significant limb
st 7,-4(9) # store it
br # return to caller

109
sysdeps/rs6000/mul_1.s Normal file
View File

@ -0,0 +1,109 @@
# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store
# the result in a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r3
# s1_ptr r4
# size r5
# s2_limb r6
# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
# obtain that operation, we have to use the 32x32->64 signed multiplication
# instruction, and add the appropriate compensation to the high limb of the
# result. We add the multiplicand if the multiplier has its most significant
# bit set, and we add the multiplier if the multiplicand has its most
# significant bit set. We need to preserve the carry flag between each
# iteration, so we have to compute the compensation carefully (the natural,
# srai+and doesn't work). Since the POWER architecture has a branch unit
# we can branch in zero cycles, so that's how we perform the additions.
.toc
.csect .__mpn_mul_1[PR]
.align 2
.globl __mpn_mul_1
.globl .__mpn_mul_1
.csect __mpn_mul_1[DS]
__mpn_mul_1:
.long .__mpn_mul_1[PR], TOC[tc0], 0
.csect .__mpn_mul_1[PR]
.__mpn_mul_1:
# Multiply the size-limb vector at s1_ptr by s2_limb, store the low product
# limbs at res_ptr and return the final carry limb in r3.
# The first limb is processed here. The sign of s2_limb, tested once by the
# cmpi below, then selects the Lpos or the Lneg loop. Both loops are
# unrolled twice and alternate between r9 and r10 as the carry limb.
cal 3,-4(3) # offset res_ptr, it's updated before used
l 0,0(4) # load least significant s1 limb
cmpi 0,6,0 # test sign of s2_limb for the blt below
mtctr 5 # copy size into CTR
mul 9,0,6 # signed multiply: high limb to r9, low limb to MQ
srai 7,0,31 # r7 = sign mask of the s1 limb
and 7,7,6 # r7 = s2_limb if the s1 limb is negative, else 0
mfmq 8 # fetch low product limb
ai 0,0,0 # reset carry
cax 9,9,7 # adjust high limb for negative limb from s1
blt Lneg # s2_limb negative: use the loop that also adds the s1 limbs
Lpos: bdz Lend # if just one limb, skip loop
Lploop: lu 0,4(4) # load s1 limb and update s1_ptr
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 10,0,6 # high limb to r10, low limb to MQ
mfmq 0 # fetch low product limb (the s1 limb is no longer needed)
ae 8,0,9 # low limb + old_cy_limb + old cy
bge Lp0 # s1 limb not negative, no adjustment needed
cax 10,10,6 # adjust high limb for negative limb from s1
Lp0: bdz Lend0 # done, with the carry limb in r10
lu 0,4(4) # second copy of the loop body, r9 and r10 swapped
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 9,0,6 # high limb to r9, low limb to MQ
mfmq 0 # fetch low product limb
ae 8,0,10 # low limb + old_cy_limb + old cy
bge Lp1 # s1 limb not negative, no adjustment needed
cax 9,9,6 # adjust high limb for negative limb from s1
Lp1: bdn Lploop # decrement CTR and loop back
b Lend # done, with the carry limb in r9
Lneg: cax 9,9,0 # adjust high limb for negative s2_limb
bdz Lend # if just one limb, skip loop
Lnloop: lu 0,4(4) # load s1 limb and update s1_ptr
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 10,0,6 # high limb to r10, low limb to MQ
cax 10,10,0 # adjust high limb for negative s2_limb
mfmq 0 # fetch low product limb
ae 8,0,9 # low limb + old_cy_limb + old cy
bge Ln0 # s1 limb not negative, no adjustment needed
cax 10,10,6 # adjust high limb for negative limb from s1
Ln0: bdz Lend0 # done, with the carry limb in r10
lu 0,4(4) # second copy of the loop body, r9 and r10 swapped
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 9,0,6 # high limb to r9, low limb to MQ
cax 9,9,0 # adjust high limb for negative s2_limb
mfmq 0 # fetch low product limb
ae 8,0,10 # low limb + old_cy_limb + old cy
bge Ln1 # s1 limb not negative, no adjustment needed
cax 9,9,6 # adjust high limb for negative limb from s1
Ln1: bdn Lnloop # decrement CTR and loop back
b Lend # done, with the carry limb in r9
Lend0: cal 9,0(10) # move the carry limb from r10 to r9
Lend: st 8,4(3) # store ultimate result limb
aze 3,9 # return carry limb plus the pending cy
br # return to caller

56
sysdeps/rs6000/rshift.s Normal file
View File

@ -0,0 +1,56 @@
# IBM POWER __mpn_rshift -- Shift a limb vector right by cnt bits and store
# the result in a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r3
# s_ptr r4
# size r5
# cnt r6
.toc
.extern __mpn_rshift[DS]
.extern .__mpn_rshift
.csect [PR]
.align 2
.globl __mpn_rshift
.globl .__mpn_rshift
.csect __mpn_rshift[DS]
__mpn_rshift:
.long .__mpn_rshift, TOC[tc0], 0
.csect [PR]
.__mpn_rshift:
# Shift the size-limb vector at s_ptr right by cnt bits and store the result
# at res_ptr, working from the least significant limb upwards. The bits
# shifted out of the least significant limb are returned in r3.
# NOTE(review): the 32-cnt shift count below suggests cnt is expected to be
# in the range 1..31 -- confirm against the callers.
sfi 8,6,32 # r8 = 32 - cnt
mtctr 5 # put limb count in CTR loop register
l 0,0(4) # read least significant limb
ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s
sle 3,0,8 # compute carry limb, and init MQ register
bdz Lend2 # if just one limb, skip loop
lu 0,4(4) # read 2:nd least significant limb
sleq 7,0,8 # compute least significant limb of result
bdz Lend # if just two limb, skip loop
Loop: lu 0,4(4) # load next higher limb
stu 7,4(9) # store previous result during read latency
sleq 7,0,8 # compute result limb
bdn Loop # loop back until CTR is zero
Lend: stu 7,4(9) # store 2:nd most significant limb
Lend2: sre 7,0,6 # compute most significant limb
st 7,4(9) # store it
br # return to caller

55
sysdeps/rs6000/sub_n.s Normal file
View File

@ -0,0 +1,55 @@
# IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
# store difference in a third limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r3
# s1_ptr r4
# s2_ptr r5
# size r6
.toc
.extern __mpn_sub_n[DS]
.extern .__mpn_sub_n
.csect [PR]
.align 2
.globl __mpn_sub_n
.globl .__mpn_sub_n
.csect __mpn_sub_n[DS]
__mpn_sub_n:
.long .__mpn_sub_n, TOC[tc0], 0
.csect [PR]
.__mpn_sub_n:
# Subtract the size-limb vector at s2_ptr from the one at s1_ptr limb by
# limb, least significant limb first, store the differences at res_ptr and
# return the final borrow in r3. The carry flag holds the inverted borrow
# throughout, hence the conversion in the last two instructions.
mtctr 6 # copy size into CTR
l 8,0(4) # load least significant s1 limb
l 0,0(5) # load least significant s2 limb
cal 3,-4(3) # offset res_ptr, it's updated before used
sf 7,0,8 # subtract least significant limbs (s1 - s2), set cy
bdz Lend # If done, skip loop
Loop: lu 8,4(4) # load s1 limb and update s1_ptr
lu 0,4(5) # load s2 limb and update s2_ptr
stu 7,4(3) # store previous limb in load latency slot
sfe 7,0,8 # subtract new limbs with cy, set cy
bdn Loop # decrement CTR and loop back
Lend: st 7,4(3) # store ultimate result limb
sfe 3,0,0 # load !cy into ...
sfi 3,3,0 # ... return value register
br # return to caller

127
sysdeps/rs6000/submul_1.s Normal file
View File

@ -0,0 +1,127 @@
# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
# the result from a second limb vector.
# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
# This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
# License for more details.
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# INPUT PARAMETERS
# res_ptr r3
# s1_ptr r4
# size r5
# s2_limb r6
# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
# obtain that operation, we have to use the 32x32->64 signed multiplication
# instruction, and add the appropriate compensation to the high limb of the
# result. We add the multiplicand if the multiplier has its most significant
# bit set, and we add the multiplier if the multiplicand has its most
# significant bit set. We need to preserve the carry flag between each
# iteration, so we have to compute the compensation carefully (the natural,
# srai+and doesn't work). Since the POWER architecture has a branch unit
# we can branch in zero cycles, so that's how we perform the additions.
.toc
.csect .__mpn_submul_1[PR]
.align 2
.globl __mpn_submul_1
.globl .__mpn_submul_1
.csect __mpn_submul_1[DS]
__mpn_submul_1:
.long .__mpn_submul_1[PR], TOC[tc0], 0
.csect .__mpn_submul_1[PR]
.__mpn_submul_1:
# Multiply the size-limb vector at s1_ptr by s2_limb, subtract the products
# from the vector at res_ptr in place and return the final carry limb in r3.
# The first limb is processed here. The sign of s2_limb, tested once by the
# cmpi below, then selects the Lpos or the Lneg loop. Both loops are
# unrolled twice and alternate between r9 and r10 as the carry limb.
# After every subtraction an extra add re-creates the carry flag with the
# opposite sense, so that it can be fed to the following ae as a borrow.
cal 3,-4(3) # offset res_ptr, it's updated before used
l 0,0(4) # load least significant s1 limb
cmpi 0,6,0 # test sign of s2_limb for the blt below
mtctr 5 # copy size into CTR
mul 9,0,6 # signed multiply: high limb to r9, low limb to MQ
srai 7,0,31 # r7 = sign mask of the s1 limb
and 7,7,6 # r7 = s2_limb if the s1 limb is negative, else 0
mfmq 11 # fetch low product limb
cax 9,9,7 # adjust high limb for negative limb from s1
l 7,4(3) # load first res_limb
sf 8,11,7 # subtract low product limb from res_limb
a 11,8,11 # invert cy (r11 is junk)
blt Lneg # s2_limb negative: use the loop that also adds the s1 limbs
Lpos: bdz Lend # if just one limb, skip loop
Lploop: lu 0,4(4) # load s1 limb and update s1_ptr
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 10,0,6 # high limb to r10, low limb to MQ
mfmq 0 # fetch low product limb (the s1 limb is no longer needed)
ae 11,0,9 # low limb + old_cy_limb + old cy
l 7,4(3) # load res_limb
aze 10,10 # propagate cy to new cy_limb
sf 8,11,7 # subtract the sum in r11 from res_limb
a 11,8,11 # invert cy (r11 is junk)
bge Lp0 # s1 limb not negative, no adjustment needed
cax 10,10,6 # adjust high limb for negative limb from s1
Lp0: bdz Lend0 # done, with the carry limb in r10
lu 0,4(4) # second copy of the loop body, r9 and r10 swapped
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 9,0,6 # high limb to r9, low limb to MQ
mfmq 0 # fetch low product limb
ae 11,0,10 # low limb + old_cy_limb + old cy
l 7,4(3) # load res_limb
aze 9,9 # propagate cy to new cy_limb
sf 8,11,7 # subtract the sum in r11 from res_limb
a 11,8,11 # invert cy (r11 is junk)
bge Lp1 # s1 limb not negative, no adjustment needed
cax 9,9,6 # adjust high limb for negative limb from s1
Lp1: bdn Lploop # decrement CTR and loop back
b Lend # done, with the carry limb in r9
Lneg: cax 9,9,0 # adjust high limb for negative s2_limb
bdz Lend # if just one limb, skip loop
Lnloop: lu 0,4(4) # load s1 limb and update s1_ptr
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 10,0,6 # high limb to r10, low limb to MQ
mfmq 7 # fetch low product limb into r7, r0 is still needed
ae 11,7,9 # low limb + old_cy_limb + old cy
l 7,4(3) # load res_limb
ae 10,10,0 # propagate cy to new cy_limb, adding the s1 limb for negative s2_limb
sf 8,11,7 # subtract the sum in r11 from res_limb
a 11,8,11 # invert cy (r11 is junk)
bge Ln0 # s1 limb not negative, no adjustment needed
cax 10,10,6 # adjust high limb for negative limb from s1
Ln0: bdz Lend0 # done, with the carry limb in r10
lu 0,4(4) # second copy of the loop body, r9 and r10 swapped
stu 8,4(3) # store previous result limb and update res_ptr
cmpi 0,0,0 # test sign of the s1 limb for the bge below
mul 9,0,6 # high limb to r9, low limb to MQ
mfmq 7 # fetch low product limb into r7, r0 is still needed
ae 11,7,10 # low limb + old_cy_limb + old cy
l 7,4(3) # load res_limb
ae 9,9,0 # propagate cy to new cy_limb, adding the s1 limb for negative s2_limb
sf 8,11,7 # subtract the sum in r11 from res_limb
a 11,8,11 # invert cy (r11 is junk)
bge Ln1 # s1 limb not negative, no adjustment needed
cax 9,9,6 # adjust high limb for negative limb from s1
Ln1: bdn Lnloop # decrement CTR and loop back
b Lend # done, with the carry limb in r9
Lend0: cal 9,0(10) # move the carry limb from r10 to r9
Lend: st 8,4(3) # store ultimate result limb
aze 3,9 # return carry limb plus the pending cy
br # return to caller

View File

@ -1,7 +1,7 @@
! sparc __mpn_add_n -- Add two limb vectors of the same length > 0 and store
! sum in a third limb vector.
! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
! This file is part of the GNU MP Library.
@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_add_n):
sub %g0,%o3,%o3
andcc %o3,(16-1),%o3
be Lzero
nop
mov %o4,%g2 ! put first s1_limb in g2 too
sll %o3,2,%o3 ! multiply by 4
sub %o0,%o3,%o0 ! adjust res_ptr
sub %o1,%o3,%o1 ! adjust s1_ptr
sub %o2,%o3,%o2 ! adjust s2_ptr
mov %o4,%g2
#if PIC
mov %o7,%g4 ! Save return address register
call 1f
add %o7,Lbase-1f,%g3
1: mov %g4,%o7 ! Restore return address register
#else
sethi %hi(Lbase),%g3
or %g3,%lo(Lbase),%g3
#endif
sll %o3,2,%o3 ! multiply by 4
jmp %g3+%o3
mov %o5,%g3
mov %o5,%g3 ! put first s2_limb in g3 too
Loop: addxcc %g2,%g3,%o3
add %o1,64,%o1

View File

@ -37,8 +37,15 @@ C_SYMBOL_NAME(__mpn_addmul_1):
sll %o2,4,%g1
and %g1,(4-1)<<4,%g1
#if PIC
mov %o7,%g4 ! Save return address register
call 1f
add %o7,LL-1f,%g3
1: mov %g4,%o7 ! Restore return address register
#else
sethi %hi(LL),%g3
or %g3,%lo(LL),%g3
#endif
jmp %g3+%g1
nop
LL:

View File

@ -34,8 +34,15 @@
C_SYMBOL_NAME(__mpn_mul_1):
sll %o2,4,%g1
and %g1,(4-1)<<4,%g1
#if PIC
mov %o7,%g4 ! Save return address register
call 1f
add %o7,LL-1f,%g3
1: mov %g4,%o7 ! Restore return address register
#else
sethi %hi(LL),%g3
or %g3,%lo(LL),%g3
#endif
jmp %g3+%g1
ld [%o1+0],%o4 ! 1
LL:

View File

@ -1,7 +1,7 @@
! sparc __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
! store difference in a third limb vector.
! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
! This file is part of the GNU MP Library.
@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_sub_n):
sub %g0,%o3,%o3
andcc %o3,(16-1),%o3
be Lzero
nop
mov %o4,%g2 ! put first s1_limb in g2 too
sll %o3,2,%o3 ! multiply by 4
sub %o0,%o3,%o0 ! adjust res_ptr
sub %o1,%o3,%o1 ! adjust s1_ptr
sub %o2,%o3,%o2 ! adjust s2_ptr
mov %o4,%g2
#if PIC
mov %o7,%g4 ! Save return address register
call 1f
add %o7,Lbase-1f,%g3
1: mov %g4,%o7 ! Restore return address register
#else
sethi %hi(Lbase),%g3
or %g3,%lo(Lbase),%g3
#endif
sll %o3,2,%o3 ! multiply by 4
jmp %g3+%o3
mov %o5,%g3
mov %o5,%g3 ! put first s2_limb in g3 too
Loop: subxcc %g2,%g3,%o3
add %o1,64,%o1

View File

@ -1,2 +1,3 @@
sys/socketcall.h
sys/timex.h
nfs/nfs.h

View File

@ -23,4 +23,8 @@ ifeq ($(subdir), socket)
headers += sys/socketcall.h
endif
ifeq ($(subdir), sunrpc)
headers += nfs/nfs.h
endif
config-LDFLAGS = -Wl,-dynamic-linker=/lib/ld-gnu.so.1

View File

@ -93,43 +93,61 @@ Cambridge, MA 02139, USA. */
(2 * movl is less expensive than pushl + popl).
Second unlike for the other registers we don't save the content of
%ecx and %edx when we have than 1 and 2 registers resp. */
%ecx and %edx when we have than 1 and 2 registers resp.
The code below might look a bit long but we have to take care for
the pipelined processors (i586 and up). Here the `pushl' and `popl'
instructions are marked as NP (not pairable) but the exception is
two consecutive of these instruction. This gives no penalty on
i386 and i486 processors though. */
#undef DO_CALL
#define DO_CALL(args) \
PUSHARGS_##args \
DOARGS_##args \
int $0x80; \
UNDOARGS_##args
int $0x80 \
POPARGS_##args
#define PUSHARGS_0 /* No arguments to push. */
#define DOARGS_0 /* No arguments to frob. */
#define UNDOARGS_0 /* No arguments to unfrob. */
#define POPARGS_0 /* No arguments to pop. */
#define _PUSHARGS_0 /* No arguments to push. */
#define _DOARGS_0(n) /* No arguments to frob. */
#define _UNDOARGS_0 /* No arguments to unfrob. */
#define _POPARGS_0 /* No arguments to pop. */
#define DOARGS_1 movl %ebx, %edx; movl 4(%esp), %ebx; DOARGS_0
#define UNDOARGS_1 UNDOARGS_0; movl %edx, %ebx
#define _DOARGS_1(n) pushl %ebx; movl n+4(%esp), %ebx; _DOARGS_0 (n)
#define _UNDOARGS_1 _UNDOARGS_0; popl %ebx
#define PUSHARGS_1 movl %ebx, %edx; PUSHARGS_0
#define DOARGS_1 _DOARGS_1 (4)
#define POPARGS_1 POPARGS_0; movl %edx, %ebx
#define _PUSHARGS_1 pushl %ebx; _PUSHARGS_0
#define _DOARGS_1(n) movl n(%esp), %ebx; _DOARGS_0(n-4)
#define _POPARGS_1 _POPARGS_0; popl %ebx
#define DOARGS_2 movl 8(%esp), %ecx; DOARGS_1
#define UNDOARGS_2 UNDOARGS_1
#define PUSHARGS_2 PUSHARGS_1
#define DOARGS_2 _DOARGS_2 (8)
#define POPARGS_2 POPARGS_1
#define _PUSHARGS_2 _PUSHARGS_1
#define _DOARGS_2(n) movl n(%esp), %ecx; _DOARGS_1 (n-4)
#define _UNDOARGS_2 _UNDOARGS_1
#define _POPARGS_2 _POPARGS_1
#define DOARGS_3 _DOARGS_3 (12)
#define UNDOARGS_3 _UNDOARGS_3
#define PUSHARGS_3 _PUSHARGS_2
#define DOARGS_3 _DOARGS_3 (16)
#define POPARGS_3 _POPARGS_3
#define _PUSHARGS_3 _PUSHARGS_2
#define _DOARGS_3(n) movl n(%esp), %edx; _DOARGS_2 (n-4)
#define _UNDOARGS_3 _UNDOARGS_2
#define _POPARGS_3 _POPARGS_2
#define DOARGS_4 _DOARGS_4 (16)
#define UNDOARGS_4 _UNDOARGS_4
#define _DOARGS_4(n) pushl %esi; movl n+4(%esp), %esi; _DOARGS_3 (n)
#define _UNDOARGS_4 _UNDOARGS_3; popl %esi
#define DOARGS_5 _DOARGS_5 (20)
#define UNDOARGS_5 _UNDOARGS_5
#define _DOARGS_5(n) pushl %edi; movl n+4(%esp), %edi; _DOARGS_4 (n)
#define _UNDOARGS_5 _UNDOARGS_4; popl %edi
#define PUSHARGS_4 _PUSHARGS_4
#define DOARGS_4 _DOARGS_4 (24)
#define POPARGS_4 _POPARGS_4
#define _PUSHARGS_4 pushl %esi; _PUSHARGS_3
#define _DOARGS_4(n) movl n(%esp), %esi; _DOARGS_3 (n-4)
#define _POPARGS_4 _POPARGS_3; popl %esi
#define PUSHARGS_5 _PUSHARGS_5
#define DOARGS_5 _DOARGS_5 (32)
#define POPARGS_5 _POPARGS_5
#define _PUSHARGS_5 pushl %edi; _PUSHARGS_4
#define _DOARGS_5(n) movl n(%esp), %edi; _DOARGS_4 (n-4)
#define _POPARGS_5 _POPARGS_4; popl %edi
#endif /* ASSEMBLER */

View File

@ -1,6 +1,6 @@
/* Minimum guaranteed maximum values for system limits. Hurd version.
/* Minimum guaranteed maximum values for system limits. Linux version.
Copyright (C) 1993, 1994 Free Software Foundation, Inc.
Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@ -18,14 +18,5 @@ License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 675 Mass Ave,
Cambridge, MA 02139, USA. */
/* Linux has a fixed limit of supplementary groups allocated with a
process. This value is determined by the size of the `groups'
member of the `task_struct' structure in <linux/sched.h>. */
#define NGROUPS_MAX 32
/* Maximum size of file names. Not all file system types support
this size but it is only a maximum value. */
#define NAME_MAX 255
/* The kernel sources contain a file with all the needed information. */
#include <linux/limits.h>

View File

@ -0,0 +1 @@
#include <linux/nfs.h>

View File

@ -1,3 +1,21 @@
/* Copyright (C) 1995 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#ifndef _SYS_PARAM_H
#define _SYS_PARAM_H
@ -7,11 +25,6 @@
#include <sys/types.h>
/* Don't change it. H.J. */
#ifdef OLD_LINUX
#undef MAXHOSTNAMELEN
#define MAXHOSTNAMELEN 8 /* max length of hostname */
#endif
#ifndef howmany
# define howmany(x, y) (((x)+((y)-1))/(y))
@ -25,8 +38,8 @@
#define NOFILE OPEN_MAX
/* Following the information of some of the kernel people I here assume
* that block size (i.e. the value of stat.st_blocks) for all filesystem
* is 512 bytes. If not tell me or HJ. -- Uli */
that block size (i.e. the value of stat.st_blocks) for all filesystem
is 512 bytes. If not tell HJ, Roland, or me. -- drepper */
#define DEV_BSIZE 512
#endif

Some files were not shown because too many files have changed in this diff Show More