// Copyright (C)  2000 Intel Corporation.  All rights reserved.
//
// $Header: /usr/development/orp/orp/arch/ia32/ia32_o1_jit/fp_compatibility.cpp,v 1.4 2001/08/13 11:30:15 ywang13 Exp $
//


#include "defines.h"
#include <iostream.h>
#include "code_emitter.h"
#include "stack.h"
#include "operand.h"
#include "fp_compatibility.h"
#include "lazy_code_selector.h"

// Mask for bit 10 of the FPU status word.  The remainder loops in this file
// AND it with eax after fnstsw and repeat the remainder step while the bit
// is still set.
#define REMAINDER_BIT_POS  0x0400

// Scale factors that fp_strict_op() loads with emit_fld80() around a double
// multiply/divide.  Each factor is stored as two uint64 words.
// NOTE(review): presumably word 0 is the 64-bit significand and the low 16
// bits of word 1 are the sign/exponent of an x87 80-bit value -- confirm.
#ifdef ORP_POSIX
// NOTE(review): these differ from the non-POSIX values in both words (zero
// significand, different exponent arithmetic) -- confirm this is intended.
static const uint64 dbl_scale_up[2]   = {0x0000000000000000,0x1FFE - 0x1FF};
static const uint64 dbl_scale_down[2] = {0x0000000000000000,0x0001+0x3FE};
#else
// Significand 0x8000000000000000 with exponent fields 0x7BFF (up) and
// 0x03FF (down); presumably 2^+15360 and 2^-15360 given the extended-format
// exponent bias of 0x3FFF -- TODO confirm.
static const uint64 dbl_scale_up[2]   = {0x8000000000000000,0x7FFE-0x3FF};
static const uint64 dbl_scale_down[2] = {0x8000000000000000,0x0001+0x3FE};
#endif

//
//  FPU control word (16 bits)
//  bit 15-13: Reserved
//  bit 12:    Infinity control
//  bit 11-10: Round control
//  bit 9-8:   Precision control
//  bit 7-6:   Reserved
//  bit 5:     Precision exception mask
//  bit 4:     Underflow exception mask
//  bit 3:     Overflow  exception mask
//  bit 2:     Zero divide exception mask
//  bit 1:     Denormalized operand exception mask
//  bit 0:     Invalid operation exception mask
//
//  Rounding control:   bit 11   10
//                           0    0  ---Round to nearest or even
//                           0    1  ---Round down (toward -infinity)
//                           1    0  ---Round up (toward +infinity)
//                           1    1  ---Chop (truncate toward 0)
//
//  Precision control:  bit  9    8
//                           0    0  ---24 bits (single precision)
//                           0    1  ---(Reserved)
//                           1    0  ---53 bits (double precision)
//                           1    1  ---64 bits (extended precision)
//  
// Masks for the two precision-control bits of the control word (see the
// table above): bit 9 set with bit 8 clear selects 53-bit precision.
#define CW_BIT9  0x0200
#define CW_BIT8  0x0100

//
// Emit code that saves the current FPU control word and switches the
// precision control to 53 bits for a double operation.
//
// NOTE: this is currently a no-op.  The unconditional return below disables
// the whole body because the VM pre-sets the control word; everything after
// it is unreachable and is kept only as a record of the intended sequence.
//
//   emitter  code emitter that would receive the instructions
//   stack    operand stack; its two highest slots would be used as scratch
//   op_pool  pool supplying the Stack_Operands for those slots
//   is_dbl   only double operations would change the precision
//
void set_fpu_control_word(Code_Emitter& emitter,
                          Stack&        stack,
                          Pre_Alloc_Operand_Pool& op_pool,
                          bool          is_dbl) {
    return; // VM has already pre-set the control word
    if (!is_dbl) return;
    //
    // The highest entry is reserved for FPU control word
    // 1. save old FPU control word onto the runtime stack
    // 2. set precision control to 53-bits (bit 8 cleared, bit 9 set)
    // 3. load the modified control word
    //
    Stack_Operand *saved_cw = op_pool.nth_stack(stack.size-1);
    Stack_Operand *new_cw   = op_pool.nth_stack(stack.size-2);
    emitter.emit_fnstcw(saved_cw->mem_opnd());
    emitter.emit_fnstcw(new_cw->mem_opnd());
    //
    // The immediates are named locals: taking the address of a temporary
    // (&Imm_Opnd(...)) is not valid standard C++.
    //
    Imm_Opnd clear_bit8_mask((~CW_BIT8)&0xFFFF);
    emitter.emit_alu(and_opc,new_cw->mem_opnd(),&clear_bit8_mask);
    Imm_Opnd set_bit9_mask(CW_BIT9);
    emitter.emit_alu(or_opc,new_cw->mem_opnd(),&set_bit9_mask);
    emitter.emit_fldcw(new_cw->mem_opnd());
}

//
// Counterpart of set_fpu_control_word(): would reload the control word that
// was saved in the highest stack slot.
//
// NOTE: currently a no-op.  The unconditional return disables the body
// because the VM pre-sets the control word; the code after it is
// unreachable.
//
void restore_old_fpu_control_word(Code_Emitter& emitter,
                                 Stack&        stack,
                                 Pre_Alloc_Operand_Pool& op_pool,
                                 bool          is_dbl) {
    return; // VM has already pre-set the control word
    if (is_dbl) {
        //
        // The highest entry is reserved for the FPU control word;
        // load the old control word back from it.
        //
        Stack_Operand *saved_cw = op_pool.nth_stack(stack.size-1);
        emitter.emit_fldcw(saved_cw->mem_opnd());
    }
}

//
// Emit a strict-mode binary FP operation on two memory operands, leaving
// the result on the FPU stack.
//
//   emitter  code emitter receiving the instructions
//   opc      operation to perform
//   m_src1   operand loaded first (fld)
//   m_src2   operand applied from memory
//   is_dbl   true for double operands, false for float
//
// Floats, and double add/sub/subr, are emitted directly as
//     fld m_src1 ; opc m_src2
// Double mul/div/divr are bracketed by the dbl_scale_down / dbl_scale_up
// factors:
//     fld80 scale_up ; fld80 scale_down ; fmul x ; op y ; fmul st(1)
// where x is the operand that gets scaled down.  For fdivr the operands are
// swapped and a plain fdiv is used so that the dividend is the scaled one.
// Any other opcode hits assert(0) and emits nothing.
//
// NOTE: the (unsigned) casts of the table addresses assume 32-bit pointers.
//
void fp_strict_op(Code_Emitter& emitter,
                  X86_FP_Opcode opc,
                  Mem_Operand   *m_src1,
                  Mem_Operand   *m_src2,
                  bool          is_dbl) 
{
    if (!is_dbl || opc == fadd_opc || opc == fsub_opc || opc == fsubr_opc)
    {
        emitter.emit_fld(m_src1->mem_opnd(),is_dbl);
        emitter.emit_fp_op_mem(opc,m_src2->mem_opnd(),is_dbl);
        return;
    }

    //
    // Decide which operand is multiplied by the scale-down factor and which
    // opcode is then applied to the other operand.
    //
    Mem_Operand   *scaled_src = m_src1;
    Mem_Operand   *other_src  = m_src2;
    X86_FP_Opcode  scaled_opc = opc;
    if (opc == fdivr_opc)
    {
        //
        // we must make the x of x/y to be scaled down
        //
        scaled_src = m_src2;
        other_src  = m_src1;
        scaled_opc = fdiv_opc;
    }
    else if (opc != fmul_opc && opc != fdiv_opc)
    {
        assert(0); // not yet implemented
        return;
    }

    //
    // Named operands instead of &M_Opnd(...): taking the address of a
    // temporary is not valid standard C++.
    //
    M_Opnd scale_up_opnd((unsigned)dbl_scale_up);
    emitter.emit_fld80(&scale_up_opnd);
    M_Opnd scale_down_opnd((unsigned)dbl_scale_down);
    emitter.emit_fld80(&scale_down_opnd);
    emitter.emit_fp_op_mem(fmul_opc,scaled_src->mem_opnd(),is_dbl);
    emitter.emit_fp_op_mem(scaled_opc,other_src->mem_opnd(),is_dbl);
    // multiply the result with the scale-up factor left in st(1)
    emitter.emit_fp_op(fmul_opc,1,true);
}

//
// loop:  st = st % st(1)
//        ax = fnstsw                  -- load FPU status word
//        if (ax & 0x0400) goto loop   -- check if remainder is completed
//
void fp_remainder_loop(Code_Emitter& emitter,
                       Stack&        stack) 
{
	stack.spill_opnds_contain(eax_reg);
    //
    // record offset for patching branch
    //
	int frem_off = emitter.get_offset();
    emitter.emit_frem();
    //
    // we need eax reg for fnstsw
    //
	Reg_Operand *reg = stack.reg_manager.get_reg(eax_reg);

    emitter.emit_fnstsw();
    emitter.emit_alu(and_opc,&reg->opnd,&Imm_Opnd(REMAINDER_BIT_POS));
	int disp = frem_off - emitter.get_offset();
	emitter.emit_branch(cc_ne,disp,0);
	reg->free_opnd(&stack.reg_manager);
}
//
//        fld  src1
//        fld  src2
// loop:  st = st % st(1)
//        ax = fnstsw                  -- load FPU status word
//        if (ax & 0x0400) goto loop   -- check if remainder is completed
//        fstp                         -- store src1%src2 result
//        fstp                         -- pop src1
//
void fp_remainder(Code_Emitter& emitter,
                  Stack& stack,
			      Pre_Alloc_Operand_Pool& op_pool,
                  Mem_Operand *m_src1,
                  Mem_Operand *m_src2,
                  bool is_dbl) 
{
	emitter.emit_fld(m_src1->mem_opnd(),is_dbl);
	emitter.emit_fld(m_src2->mem_opnd(),is_dbl);
	stack.spill_opnds_contain(eax_reg);
    //
    // record offset for patching branch
    //
	int frem_off = emitter.get_offset();
    emitter.emit_frem();
    //
    // we need eax reg for fnstsw
    //
	Reg_Operand *reg = stack.reg_manager.get_reg(eax_reg);

    emitter.emit_fnstsw();
    emitter.emit_alu(and_opc,&reg->opnd,&Imm_Opnd(REMAINDER_BIT_POS));
	int disp = frem_off - emitter.get_offset();
	emitter.emit_branch(cc_ne,disp,0);

    Stack_Operand *dst = NULL;
    if (is_dbl)
    {
        dst = op_pool.nth_stack(stack.depth()+1);
	    stack.push64(dst,op_pool.nth_stack(stack.depth()));
    }
    else
    {
	    dst = op_pool.nth_stack(stack.depth());
	    stack.push(dst);
    }
	emitter.emit_fst(dst->mem_opnd(),is_dbl,1);
    emitter.emit_fstp(0);
	reg->free_opnd(&stack.reg_manager);
}

//
// Non-strict optimization: leave a temporary result on the FPU stack and
// represent it on the operand stack with Fp_Operand(s) instead of storing
// it back to memory.  This is only usable in fp non-strict mode because the
// fp stack is 80-bit, which may not conform to the Java FP spec.
//
// A float pushes one Fp_Operand; a double pushes a pair.  Every operand
// created records the current stack.fp_get_cnt() in fpstack_cnt.
//
void result_on_fp_stack(Mem_Manager& mem_manager, Stack& stack, bool is_dbl)
{
    Fp_Operand *result_lo = new(mem_manager)Fp_Operand(is_dbl);
    if (is_dbl) {
        // double: a second operand fills the other 32-bit slot
        Fp_Operand *result_hi = new(mem_manager)Fp_Operand(is_dbl);
        result_lo->fpstack_cnt = stack.fp_get_cnt();
        result_hi->fpstack_cnt = stack.fp_get_cnt();
        stack.push64(result_lo,result_hi);
        return;
    }
    // float: a single slot
    result_lo->fpstack_cnt = stack.fp_get_cnt();
    stack.push(result_lo);
}

//
// Generate code for a binary floating-point operation whose two operands
// are on top of the operand stack.
//
// strict mode:
//    All temporary floating-point results are stored back to memory stack.
//    To follow Java FP spec, the intended steps for every floating-point
//    operation are:
//    1. save fp control word (save old control word)
//    2. set the precision control to 53 bits (double)
//    3. scale down because the size of exponent is wider than Java FP spec
//    4. perform operation
//    5. scale up 
//    6. restore fp control word (restore old control word)
//
// non-strict mode:
//    The first operand is brought onto the FPU stack, the operation is
//    applied, and the result stays on the FPU stack (result_on_fp_stack).
//
void gen_fp(Code_Emitter&           emitter,
            X86_FP_Opcode           opc,
            bool                    is_dbl,
            Stack&                  stack,
            Mem_Manager&            mem_manager,
            Frame&                  frame,
            Pre_Alloc_Operand_Pool& op_pool) 
{
    //
    // fld		first
    // fp_op	second
    // fst		result
    //
    Operand *first;
    Operand *second;
    if (!is_dbl) {
        first  = stack.pop();
        second = stack.pop();
    } else {
        // only the first operand of each popped pair is used below
        Operand *unused_hi;
        stack.pop64(first,unused_hi);
        stack.pop64(second,unused_hi);
    }

    if (!stack.fp_strict_mode)
    {
        load_onto_fp_stack(stack, first, is_dbl);
        if (second->kind != Operand::Fp) {
            //
            // 32-bit floating point value may be in reg due to "dup"s bytecode
            //
            assert(second->is_mem() || second->is_reg());
            second->free_opnd(&stack.reg_manager);
            if (second->is_reg()) {
                // move the register value to a stack slot so the FP
                // instruction can take it as a memory operand
                Stack_Operand *spill_slot = op_pool.nth_stack(stack.depth()+1);
                second->emit_mov_to_mem(emitter, spill_slot->mem_opnd());
                second = spill_slot;
            }
            emitter.emit_fp_op_mem(opc,((Mem_Operand*)second)->mem_opnd(),is_dbl);
        } else {
            // second operand already lives on the FPU stack: operate on it
            // by its distance from the top, then drop that entry
            Fp_Operand *fp_second = (Fp_Operand*)second;
            emitter.emit_fp_op(opc, ((stack.fp_get_cnt()) - fp_second->fpstack_cnt));
            emitter.emit_fstp(((stack.fp_get_cnt()) - fp_second->fpstack_cnt));
            stack.fp_dec_cnt();
        }
        result_on_fp_stack(mem_manager, stack, is_dbl);
        return;
    }

    //
    // strict mode: both operands must be in memory
    //
    assert(first->is_mem());
    assert(second->is_mem());

    first->free_opnd(&stack.reg_manager);
    second->free_opnd(&stack.reg_manager);
    Mem_Operand *mem_first  = (Mem_Operand*)first;
    Mem_Operand *mem_second = (Mem_Operand*)second;
    //
    // FP_COMPATIBILITY
    //
    if (USE_FP_COMPATIBILITY) 
    {
        set_fpu_control_word(emitter, stack, op_pool, is_dbl);
        fp_strict_op(emitter, opc, mem_first, mem_second, is_dbl);
    } 
    else
    {
        emitter.emit_fld(mem_first->mem_opnd(),is_dbl);
        emitter.emit_fp_op_mem(opc,mem_second->mem_opnd(),is_dbl);
    }
    //
    // To do: make this be double when necessary
    //
    Stack_Operand *result_slot;
    if (!is_dbl) {
        result_slot = op_pool.nth_stack(stack.depth());
        stack.push(result_slot);
    } else {
        Stack_Operand *result_hi = new(mem_manager) Stack_Operand(frame,stack.depth());
        result_slot = op_pool.nth_stack(stack.depth()+1);
        stack.push64(result_slot,result_hi);
    }
    //
    // FP_COMPATIBILITY
    //
    if (USE_FP_COMPATIBILITY)
        restore_old_fpu_control_word(emitter, stack, op_pool, is_dbl);

    // write the result from the FPU stack into the pushed slot
    emitter.emit_fst(result_slot->mem_opnd(),is_dbl,1);
}

//
// Make sure src is on the fp stack.
//
// An operand that is not in memory must already be an Fp operand (asserted)
// and nothing is emitted for it.  A memory operand is released and loaded
// with fld; the fp stack count is incremented first when fp_check_stack()
// returns true.
//
void load_onto_fp_stack(Stack&  stack, 
                        Operand *src,
                        bool    is_dbl) {
    if (!src->is_mem()) {
        assert(src->kind == Operand::Fp);
        return;
    }

    src->free_opnd(&stack.reg_manager);

    if (stack.fp_check_stack(is_dbl, stack.op_pool, stack.emitter)) {
        stack.fp_inc_cnt();
    }
    Mem_Operand *mem_src = (Mem_Operand*)src;
    mem_src->emit_fld(stack.emitter,is_dbl);
}

//
// Store a double into local variables var_index / var_index+1.
//
// When we store a double var_i, the store actually takes two slots var_i and
// var_i+1. The memory locations of var_i and var_i+1 may not be contiguous.
// In the frame layout, there are incoming args (var_0, var_1, ...), return ip,
// vars (var_k, var_k+1, ...), spills, ... etc, where k is the number of 
// incoming arguments. Javac may decide to treat var_k-1, and var_k as a 
// double.  In the current fast code generation model, var_k is in the 
// vars area and var_k-1 is the incoming args.
//
//   emitter  code emitter receiving the stores
//   frame    frame describing where the variables and spill slots live
//   stack    operand stack (also supplies the spill-area operands)
//   index    variable index of the double; index+1 is its other slot
//   src_lo   operand stored to var_index+1 (may be an Fp operand)
//   src_hi   operand stored to var_index
// 
void store_double(Code_Emitter& emitter,
                  Frame&        frame,
                  Stack&        stack,
                  unsigned      index,
                  Operand       *src_lo,
                  Operand       *src_hi)
{
    //
    // Memory operands are named locals throughout: taking the address of a
    // temporary (&M_Var_Opnd(...)) is not valid standard C++.
    //
    if (!stack.fp_strict_mode && src_lo->kind == Operand::Fp)
    {
        if (frame.contiguous_loc(index+1,index)) {
            // presumably one FP store through var_index+1 covers both
            // slots when they are contiguous -- confirm in gen_store32
            M_Var_Opnd var_lo(frame,index+1);
            gen_store32(emitter,stack,&var_lo,src_lo);
        } else {
            //
            // we use stack spill area as the buffer to move the double value 
            // on the fp stack to var_index and var_index+1's locations
            //
            Operand *lo = stack.op_pool.nth_stack(stack.depth()+1);
            Operand *hi = stack.op_pool.nth_stack(stack.depth());
            M_Spill_Opnd spill_buf(frame,stack.depth()+1);
            gen_store32(emitter,stack,&spill_buf,src_lo);
            M_Var_Opnd var_lo(frame,index+1);
            gen_store32(emitter,stack,&var_lo,lo);
            M_Var_Opnd var_hi(frame,index);
            gen_store32(emitter,stack,&var_hi,hi);
        }
        return;
    }

    //
    // Strict mode, or the value is not on the fp stack: store the two
    // halves separately.
    //
    M_Var_Opnd var_hi(frame,index);
    gen_store32(emitter,stack,&var_hi,src_hi);
    M_Var_Opnd var_lo(frame,index+1);
    gen_store32(emitter,stack,&var_lo,src_lo);
}


//
// Push the double held in local variables var_index / var_index+1 onto the
// operand stack (see comments of store_double() for the frame layout).
//
// If the two variable slots are contiguous they are pushed directly as
// memory operands.  Otherwise both halves are first copied into the stack
// spill slots, and those slots are pushed instead.
//
//   emitter  code emitter receiving the copies (non-contiguous case only)
//   frame    frame describing where the variables and spill slots live
//   stack    operand stack receiving the pair
//   mem      memory manager used to allocate the pushed operands
//   index    variable index of the double; index+1 is its other slot
//
void load_double(Code_Emitter& emitter,
                 Frame&        frame,
                 Stack&        stack,
                 Mem_Manager&  mem,
                 unsigned      index)
{
    if (frame.contiguous_loc(index+1,index)) {
        stack.push64(new (mem) Mem_Var_Operand(frame,index+1,0),
                     new (mem) Mem_Var_Operand(frame,index,0));
        return;
    }

    //
    // we move the value stored in var_index and var_index+1's locations which are
    // not contiguous in memory to the stack spill locations that are guaranteed to
    // be contiguous.
    //
    Mem_Var_Operand var_lo(frame,index+1,0);
    Mem_Var_Operand var_hi(frame,index,0);
    Operand *lo = &var_lo, *hi = &var_hi;
    //
    // Spill destinations are named locals: taking the address of a
    // temporary (&M_Spill_Opnd(...)) is not valid standard C++.
    //
    M_Spill_Opnd spill_hi(frame,stack.depth());
    gen_store32(emitter,stack,&spill_hi, hi);
    M_Spill_Opnd spill_lo(frame,stack.depth()+1);
    gen_store32(emitter,stack,&spill_lo, lo);
    stack.push64(stack.op_pool.nth_stack(stack.depth()+1),
                 stack.op_pool.nth_stack(stack.depth()));
}
