/**************************************************************************
**
**  svd3
**
**  Quick singular value decomposition as described by:
**  A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
**  Computing the Singular Value Decomposition of 3x3 matrices
**  with minimal branching and elementary floating point operations,
**  University of Wisconsin - Madison technical report TR1690, May 2011
**
**	Identical GPU version
** 	Implementated by: Kui Wu
**	kwu@cs.utah.edu
**
**  May 2018
**
**************************************************************************/

#ifndef SVD3_CUDA_H
#define SVD3_CUDA_H

#include <cuda.h>
#include "math.h" // CUDA math library

#define gone					1065353216
#define gsine_pi_over_eight		1053028117
#define gcosine_pi_over_eight   1064076127
#define gone_half				0.5f
#define gsmall_number			1.e-12f
#define gtiny_number			1.e-20f
#define gfour_gamma_squared		5.8284273147583007813f

union un { float f; unsigned int ui; };

__device__ __forceinline__ void svd3(
	float a11, float a12, float a13,
  float a21, float a22, float a23,
  float a31, float a32, float a33, // input A
  float &u11, float &u12, float &u13,
  float &u21, float &u22, float &u23,
  float &u31, float &u32, float &u33,	// output U
	float &s11, float &s22, float &s33,	// output S
	float &v11, float &v12, float &v13,
  float &v21, float &v22, float &v23,
  float &v31, float &v32, float &v33 // output V
) {
	un Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
	un Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
	un Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
	un Sc, Ss, Sch, Ssh;
	un Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
	un Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
	un Sqvs, Sqvvx, Sqvvy, Sqvvz; 

	Sa11.f = a11; Sa12.f = a12; Sa13.f = a13;
	Sa21.f = a21; Sa22.f = a22; Sa23.f = a23;
	Sa31.f = a31; Sa32.f = a32; Sa33.f = a33;

	//###########################################################
	// Compute normal equations matrix
	//###########################################################

	Ss11.f = Sa11.f*Sa11.f;									
	Stmp1.f = Sa21.f*Sa21.f;								
	Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);					
	Stmp1.f = Sa31.f*Sa31.f;								
	Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);					

	Ss21.f = Sa12.f*Sa11.f;									
	Stmp1.f = Sa22.f*Sa21.f;								
	Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);					
	Stmp1.f = Sa32.f*Sa31.f;								
	Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);					

	Ss31.f = Sa13.f*Sa11.f;									
	Stmp1.f = Sa23.f*Sa21.f;								
	Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);					
	Stmp1.f = Sa33.f*Sa31.f;								
	Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);					

	Ss22.f = Sa12.f*Sa12.f;									
	Stmp1.f = Sa22.f*Sa22.f;								
	Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);					
	Stmp1.f = Sa32.f*Sa32.f;								
	Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);					

	Ss32.f = Sa13.f*Sa12.f;									
	Stmp1.f = Sa23.f*Sa22.f;								
	Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);					
	Stmp1.f = Sa33.f*Sa32.f;								
	Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);					

	Ss33.f = Sa13.f*Sa13.f;									
	Stmp1.f = Sa23.f*Sa23.f;								
	Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);					
	Stmp1.f = Sa33.f*Sa33.f;								
	Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);					

	Sqvs.f = 1.f; Sqvvx.f = 0.f; Sqvvy.f = 0.f; Sqvvz.f = 0.f;

	//###########################################################
	// Solve symmetric eigenproblem using Jacobi iteration
	//###########################################################
	for (int i = 0; i < 4; i++)
	{
		Ssh.f = Ss21.f * 0.5f;									
		Stmp5.f = __fsub_rn(Ss11.f, Ss22.f);					       
		
		Stmp2.f = Ssh.f*Ssh.f;                                         
		Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;	   
		Ssh.ui = Stmp1.ui&Ssh.ui;                                      
		Sch.ui = Stmp1.ui&Stmp5.ui;                            	       
		Stmp2.ui = ~Stmp1.ui&gone;                           	   
		Sch.ui = Sch.ui | Stmp2.ui;                            	       
		
		Stmp1.f = Ssh.f*Ssh.f;									       
		Stmp2.f = Sch.f*Sch.f;									       
		Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);					       
		Stmp4.f = __frsqrt_rn(Stmp3.f);							       
		
		Ssh.f = Stmp4.f*Ssh.f;									       
		Sch.f = Stmp4.f*Sch.f;									       
		Stmp1.f = gfour_gamma_squared*Stmp1.f;					   
		Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;		       
		
		Stmp2.ui = gsine_pi_over_eight&Stmp1.ui;				       
		Ssh.ui = ~Stmp1.ui&Ssh.ui;								       
		Ssh.ui = Ssh.ui | Stmp2.ui;								       
		Stmp2.ui = gcosine_pi_over_eight&Stmp1.ui;				   
		Sch.ui = ~Stmp1.ui&Sch.ui;								       
		Sch.ui = Sch.ui | Stmp2.ui;								       
		
		Stmp1.f = Ssh.f * Ssh.f;								       
		Stmp2.f = Sch.f * Sch.f;								
		Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);						
		Ss.f = Sch.f * Ssh.f;									       
		Ss.f = __fadd_rn(Ss.f, Ss.f);							       

		//###########################################################
		// Perform the actual Givens conjugation
		//###########################################################

		Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);							
		Ss33.f = Ss33.f * Stmp3.f;										
		Ss31.f = Ss31.f * Stmp3.f;										
		Ss32.f = Ss32.f * Stmp3.f;										
		Ss33.f = Ss33.f * Stmp3.f;										

		Stmp1.f = Ss.f * Ss31.f;										                                
		Stmp2.f = Ss.f * Ss32.f;										                                
		Ss31.f = Sc.f * Ss31.f;											                                
		Ss32.f = Sc.f * Ss32.f;											                                
		Ss31.f = __fadd_rn(Stmp2.f, Ss31.f);							                                
		Ss32.f = __fsub_rn(Ss32.f, Stmp1.f);							                                
		
		Stmp2.f = Ss.f*Ss.f;											                                
		Stmp1.f = Ss22.f*Stmp2.f;										                                
		Stmp3.f = Ss11.f*Stmp2.f;										                                
		Stmp4.f = Sc.f*Sc.f;											                                
		Ss11.f = Ss11.f*Stmp4.f;										                                
		Ss22.f = Ss22.f*Stmp4.f;										                                
		Ss11.f = __fadd_rn(Ss11.f, Stmp1.f);							                                
		Ss22.f = __fadd_rn(Ss22.f, Stmp3.f);							                                
		Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);							                                
		Stmp2.f = __fadd_rn(Ss21.f, Ss21.f);							                                
		Ss21.f = Ss21.f*Stmp4.f;										                                
		Stmp4.f = Sc.f*Ss.f;											                                
		Stmp2.f = Stmp2.f*Stmp4.f;										                                
		Stmp5.f = Stmp5.f*Stmp4.f;										                                
		Ss11.f = __fadd_rn(Ss11.f, Stmp2.f);							                                
		Ss21.f = __fsub_rn(Ss21.f, Stmp5.f);							                                
		Ss22.f = __fsub_rn(Ss22.f, Stmp2.f);							                                

		//###########################################################
		// Compute the cumulative rotation, in quaternion form
		//###########################################################

		Stmp1.f = Ssh.f*Sqvvx.f;										                    
		Stmp2.f = Ssh.f*Sqvvy.f;										                    
		Stmp3.f = Ssh.f*Sqvvz.f;										                    
		Ssh.f = Ssh.f*Sqvs.f;											                    

		Sqvs.f = Sch.f*Sqvs.f;											                    
		Sqvvx.f = Sch.f*Sqvvx.f;										                        
		Sqvvy.f = Sch.f*Sqvvy.f;										                        
		Sqvvz.f = Sch.f*Sqvvz.f;										                        

		Sqvvz.f = __fadd_rn(Sqvvz.f, Ssh.f);							                                
		Sqvs.f = __fsub_rn(Sqvs.f, Stmp3.f);							                                
		Sqvvx.f = __fadd_rn(Sqvvx.f, Stmp2.f);							                                
		Sqvvy.f = __fsub_rn(Sqvvy.f, Stmp1.f);							                            

		//////////////////////////////////////////////////////////////////////////
		// (1->3)
		//////////////////////////////////////////////////////////////////////////
		Ssh.f = Ss32.f * 0.5f;									 
		Stmp5.f = __fsub_rn(Ss22.f, Ss33.f);					                         
		
		Stmp2.f = Ssh.f * Ssh.f;                                         
		Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;	     
		Ssh.ui = Stmp1.ui&Ssh.ui;                                        
		Sch.ui = Stmp1.ui&Stmp5.ui;                            	     
		Stmp2.ui = ~Stmp1.ui&gone;                           	     
		Sch.ui = Sch.ui | Stmp2.ui;                            	     
		
		Stmp1.f = Ssh.f * Ssh.f;								             
		Stmp2.f = Sch.f * Sch.f;								             
		Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);					                 
		Stmp4.f = __frsqrt_rn(Stmp3.f);							             
		
		Ssh.f = Stmp4.f * Ssh.f;								             
		Sch.f = Stmp4.f * Sch.f;								             
		Stmp1.f = gfour_gamma_squared * Stmp1.f;				         
		Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;		         
		
		Stmp2.ui = gsine_pi_over_eight&Stmp1.ui;				     
		Ssh.ui = ~Stmp1.ui&Ssh.ui;								         
		Ssh.ui = Ssh.ui | Stmp2.ui;								     
		Stmp2.ui = gcosine_pi_over_eight&Stmp1.ui;				     
		Sch.ui = ~Stmp1.ui&Sch.ui;								         
		Sch.ui = Sch.ui | Stmp2.ui;								     
		
		Stmp1.f = Ssh.f * Ssh.f;								             
		Stmp2.f = Sch.f * Sch.f;								             
		Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);						     
		Ss.f = Sch.f*Ssh.f;										     
		Ss.f = __fadd_rn(Ss.f, Ss.f);							                 

		//###########################################################
		// Perform the actual Givens conjugation
		//###########################################################

		Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);						
		Ss11.f = Ss11.f * Stmp3.f;									
		Ss21.f = Ss21.f * Stmp3.f;									
		Ss31.f = Ss31.f * Stmp3.f;									
		Ss11.f = Ss11.f * Stmp3.f;									
		
		Stmp1.f = Ss.f*Ss21.f;										              
		Stmp2.f = Ss.f*Ss31.f;										              
		Ss21.f = Sc.f*Ss21.f;										                  
		Ss31.f = Sc.f*Ss31.f;										                  
		Ss21.f = __fadd_rn(Stmp2.f, Ss21.f);						                              
		Ss31.f = __fsub_rn(Ss31.f, Stmp1.f);						                              
		
		Stmp2.f = Ss.f*Ss.f;										              
		Stmp1.f = Ss33.f*Stmp2.f;									              
		Stmp3.f = Ss22.f*Stmp2.f;									              
		Stmp4.f = Sc.f * Sc.f;										              
		Ss22.f = Ss22.f * Stmp4.f;									                  
		Ss33.f = Ss33.f * Stmp4.f;									                  
		Ss22.f = __fadd_rn(Ss22.f, Stmp1.f);						                              
		Ss33.f = __fadd_rn(Ss33.f, Stmp3.f);						                              
		Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);						                      
		Stmp2.f = __fadd_rn(Ss32.f, Ss32.f);						                              
		Ss32.f = Ss32.f*Stmp4.f;									                  
		Stmp4.f = Sc.f*Ss.f;										              
		Stmp2.f = Stmp2.f*Stmp4.f;									              
		Stmp5.f = Stmp5.f*Stmp4.f;									              
		Ss22.f = __fadd_rn(Ss22.f, Stmp2.f);						                              
		Ss32.f = __fsub_rn(Ss32.f, Stmp5.f);						                  
		Ss33.f = __fsub_rn(Ss33.f, Stmp2.f);						                  

		//###########################################################
		// Compute the cumulative rotation, in quaternion form
		//###########################################################

		Stmp1.f = Ssh.f*Sqvvx.f;										                      
		Stmp2.f = Ssh.f*Sqvvy.f;										                      
		Stmp3.f = Ssh.f*Sqvvz.f;										                      
		Ssh.f = Ssh.f*Sqvs.f;											                      

		Sqvs.f = Sch.f*Sqvs.f;											                      
		Sqvvx.f = Sch.f*Sqvvx.f;										                          
		Sqvvy.f = Sch.f*Sqvvy.f;										                          
		Sqvvz.f = Sch.f*Sqvvz.f;										                          

		Sqvvx.f = __fadd_rn(Sqvvx.f, Ssh.f);							                                  
		Sqvs.f = __fsub_rn(Sqvs.f, Stmp1.f);							                                  
		Sqvvy.f = __fadd_rn(Sqvvy.f, Stmp3.f);							                                  
		Sqvvz.f = __fsub_rn(Sqvvz.f, Stmp2.f);							 

#if 1
		//////////////////////////////////////////////////////////////////////////
		// 1 -> 2
		//////////////////////////////////////////////////////////////////////////

		Ssh.f = Ss31.f * 0.5f;									  
		Stmp5.f = __fsub_rn(Ss33.f, Ss11.f);					                          
		
		Stmp2.f = Ssh.f*Ssh.f;                                            
		Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;	      
		Ssh.ui = Stmp1.ui&Ssh.ui;                                         
		Sch.ui = Stmp1.ui&Stmp5.ui;                            	      
		Stmp2.ui = ~Stmp1.ui&gone;                           	      
		Sch.ui = Sch.ui | Stmp2.ui;                            	      
		
		Stmp1.f = Ssh.f*Ssh.f;									          
		Stmp2.f = Sch.f*Sch.f;									          
		Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);					                      
		Stmp4.f = __frsqrt_rn(Stmp3.f);							              
		
		Ssh.f = Stmp4.f*Ssh.f;									          
		Sch.f = Stmp4.f*Sch.f;									          
		Stmp1.f = gfour_gamma_squared*Stmp1.f;					      
		Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;		          
		
		Stmp2.ui = gsine_pi_over_eight&Stmp1.ui;				      
		Ssh.ui = ~Stmp1.ui&Ssh.ui;								          
		Ssh.ui = Ssh.ui | Stmp2.ui;								      
		Stmp2.ui = gcosine_pi_over_eight&Stmp1.ui;				      
		Sch.ui = ~Stmp1.ui&Sch.ui;								          
		Sch.ui = Sch.ui | Stmp2.ui;								      
		
		Stmp1.f = Ssh.f*Ssh.f;									          
		Stmp2.f = Sch.f*Sch.f;									          
		Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);						      
		Ss.f = Sch.f*Ssh.f;										      
		Ss.f = __fadd_rn(Ss.f, Ss.f);							                  

		//###########################################################
		// Perform the actual Givens conjugation
		//###########################################################

		Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);							
		Ss22.f = Ss22.f * Stmp3.f;										
		Ss32.f = Ss32.f * Stmp3.f;										
		Ss21.f = Ss21.f * Stmp3.f;										
		Ss22.f = Ss22.f * Stmp3.f;										
		
		Stmp1.f = Ss.f*Ss32.f;											                
		Stmp2.f = Ss.f*Ss21.f;											                
		Ss32.f = Sc.f*Ss32.f;											                    
		Ss21.f = Sc.f*Ss21.f;											                    
		Ss32.f = __fadd_rn(Stmp2.f, Ss32.f);							                                
		Ss21.f = __fsub_rn(Ss21.f, Stmp1.f);							                                
		
		Stmp2.f = Ss.f*Ss.f;											                
		Stmp1.f = Ss11.f*Stmp2.f;										                
		Stmp3.f = Ss33.f*Stmp2.f;										                
		Stmp4.f = Sc.f*Sc.f;											                
		Ss33.f = Ss33.f*Stmp4.f;										                    
		Ss11.f = Ss11.f*Stmp4.f;										                    
		Ss33.f = __fadd_rn(Ss33.f, Stmp1.f);							                                
		Ss11.f = __fadd_rn(Ss11.f, Stmp3.f);							                                
		Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);							                        
		Stmp2.f = __fadd_rn(Ss31.f, Ss31.f);							                                
		Ss31.f = Ss31.f*Stmp4.f;										                    
		Stmp4.f = Sc.f*Ss.f;											                
		Stmp2.f = Stmp2.f*Stmp4.f;										                
		Stmp5.f = Stmp5.f*Stmp4.f;										                
		Ss33.f = __fadd_rn(Ss33.f, Stmp2.f);							                                
		Ss31.f = __fsub_rn(Ss31.f, Stmp5.f);							                                
		Ss11.f = __fsub_rn(Ss11.f, Stmp2.f);							                                

		//###########################################################
		// Compute the cumulative rotation, in quaternion form
		//###########################################################

		Stmp1.f = Ssh.f*Sqvvx.f;										                        
		Stmp2.f = Ssh.f*Sqvvy.f;										                        
		Stmp3.f = Ssh.f*Sqvvz.f;										                        
		Ssh.f = Ssh.f*Sqvs.f;											                        

		Sqvs.f = Sch.f*Sqvs.f;											                        
		Sqvvx.f = Sch.f*Sqvvx.f;										                            
		Sqvvy.f = Sch.f*Sqvvy.f;										                            
		Sqvvz.f = Sch.f*Sqvvz.f;										                            

		Sqvvy.f = __fadd_rn(Sqvvy.f, Ssh.f);							                                    
		Sqvs.f = __fsub_rn(Sqvs.f, Stmp2.f);							                        
		Sqvvz.f = __fadd_rn(Sqvvz.f, Stmp1.f);							                                    
		Sqvvx.f = __fsub_rn(Sqvvx.f, Stmp3.f);							
#endif
	}

	//###########################################################
	// Normalize quaternion for matrix V
	//###########################################################

	Stmp2.f = Sqvs.f*Sqvs.f;
	Stmp1.f = Sqvvx.f*Sqvvx.f;
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = Sqvvy.f*Sqvvy.f; 
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
	Stmp1.f = Sqvvz.f*Sqvvz.f; 
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);

	Stmp1.f = __frsqrt_rn(Stmp2.f);
	Stmp4.f = Stmp1.f*0.5f;
	Stmp3.f = Stmp1.f*Stmp4.f;
	Stmp3.f = Stmp1.f*Stmp3.f;
	Stmp3.f = Stmp2.f*Stmp3.f;
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
	Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

	Sqvs.f = Sqvs.f*Stmp1.f;
	Sqvvx.f = Sqvvx.f*Stmp1.f;
	Sqvvy.f = Sqvvy.f*Stmp1.f;
	Sqvvz.f = Sqvvz.f*Stmp1.f;

	//###########################################################
	// Transform quaternion to matrix V
	//###########################################################

	Stmp1.f = Sqvvx.f*Sqvvx.f;
	Stmp2.f = Sqvvy.f*Sqvvy.f;
	Stmp3.f = Sqvvz.f*Sqvvz.f;
	Sv11.f = Sqvs.f*Sqvs.f;
	Sv22.f = __fsub_rn(Sv11.f, Stmp1.f);
	Sv33.f = __fsub_rn(Sv22.f, Stmp2.f);
	Sv33.f = __fadd_rn(Sv33.f, Stmp3.f);
	Sv22.f = __fadd_rn(Sv22.f, Stmp2.f);
	Sv22.f = __fsub_rn(Sv22.f, Stmp3.f);
	Sv11.f = __fadd_rn(Sv11.f, Stmp1.f);
	Sv11.f = __fsub_rn(Sv11.f, Stmp2.f);
	Sv11.f = __fsub_rn(Sv11.f, Stmp3.f);
	Stmp1.f = __fadd_rn(Sqvvx.f, Sqvvx.f);
	Stmp2.f = __fadd_rn(Sqvvy.f, Sqvvy.f);
	Stmp3.f = __fadd_rn(Sqvvz.f, Sqvvz.f);
	Sv32.f = Sqvs.f*Stmp1.f;
	Sv13.f = Sqvs.f*Stmp2.f;
	Sv21.f = Sqvs.f*Stmp3.f;
	Stmp1.f = Sqvvy.f*Stmp1.f;
	Stmp2.f = Sqvvz.f*Stmp2.f;
	Stmp3.f = Sqvvx.f*Stmp3.f;
	Sv12.f = __fsub_rn(Stmp1.f, Sv21.f);
	Sv23.f = __fsub_rn(Stmp2.f, Sv32.f);
	Sv31.f = __fsub_rn(Stmp3.f, Sv13.f);
	Sv21.f = __fadd_rn(Stmp1.f, Sv21.f);
	Sv32.f = __fadd_rn(Stmp2.f, Sv32.f);
	Sv13.f = __fadd_rn(Stmp3.f, Sv13.f);

	///###########################################################
	// Multiply (from the right) with V
	//###########################################################

	Stmp2.f = Sa12.f;
	Stmp3.f = Sa13.f;
	Sa12.f = Sv12.f*Sa11.f;
	Sa13.f = Sv13.f*Sa11.f;
	Sa11.f = Sv11.f*Sa11.f;
	Stmp1.f = Sv21.f*Stmp2.f;
	Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
	Stmp1.f = Sv31.f*Stmp3.f;
	Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
	Stmp1.f = Sv22.f*Stmp2.f;
	Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
	Stmp1.f = Sv32.f*Stmp3.f;
	Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
	Stmp1.f = Sv23.f*Stmp2.f;
	Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
	Stmp1.f = Sv33.f*Stmp3.f;
	Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);

	Stmp2.f = Sa22.f;
	Stmp3.f = Sa23.f;
	Sa22.f = Sv12.f*Sa21.f;
	Sa23.f = Sv13.f*Sa21.f;
	Sa21.f = Sv11.f*Sa21.f;
	Stmp1.f = Sv21.f*Stmp2.f;
	Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
	Stmp1.f = Sv31.f*Stmp3.f;
	Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
	Stmp1.f = Sv22.f*Stmp2.f;
	Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
	Stmp1.f = Sv32.f*Stmp3.f;
	Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
	Stmp1.f = Sv23.f*Stmp2.f;
	Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
	Stmp1.f = Sv33.f*Stmp3.f;
	Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);

	Stmp2.f = Sa32.f;
	Stmp3.f = Sa33.f;
	Sa32.f = Sv12.f*Sa31.f;
	Sa33.f = Sv13.f*Sa31.f;
	Sa31.f = Sv11.f*Sa31.f;
	Stmp1.f = Sv21.f*Stmp2.f;
	Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
	Stmp1.f = Sv31.f*Stmp3.f;
	Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
	Stmp1.f = Sv22.f*Stmp2.f;
	Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
	Stmp1.f = Sv32.f*Stmp3.f;
	Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
	Stmp1.f = Sv23.f*Stmp2.f;
	Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
	Stmp1.f = Sv33.f*Stmp3.f;
	Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);

	//###########################################################
	// Permute columns such that the singular values are sorted
	//###########################################################

	Stmp1.f = Sa11.f*Sa11.f;								
	Stmp4.f = Sa21.f*Sa21.f;								
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);					
	Stmp4.f = Sa31.f*Sa31.f;								
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);					

	Stmp2.f = Sa12.f*Sa12.f;								
	Stmp4.f = Sa22.f*Sa22.f;								
	Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);					
	Stmp4.f = Sa32.f*Sa32.f;								
	Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);					

	Stmp3.f = Sa13.f*Sa13.f;								
	Stmp4.f = Sa23.f*Sa23.f;								
	Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);					
	Stmp4.f = Sa33.f*Sa33.f;								
	Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);					

	// Swap columns 1-2 if necessary

	Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;	
	Stmp5.ui = Sa11.ui^Sa12.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa11.ui = Sa11.ui^Stmp5.ui;								
	Sa12.ui = Sa12.ui^Stmp5.ui;								

	Stmp5.ui = Sa21.ui^Sa22.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa21.ui = Sa21.ui^Stmp5.ui;								
	Sa22.ui = Sa22.ui^Stmp5.ui;								

	Stmp5.ui = Sa31.ui^Sa32.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa31.ui = Sa31.ui^Stmp5.ui;								
	Sa32.ui = Sa32.ui^Stmp5.ui;								

	Stmp5.ui = Sv11.ui^Sv12.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv11.ui = Sv11.ui^Stmp5.ui;								
	Sv12.ui = Sv12.ui^Stmp5.ui;								

	Stmp5.ui = Sv21.ui^Sv22.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv21.ui = Sv21.ui^Stmp5.ui;								
	Sv22.ui = Sv22.ui^Stmp5.ui;								

	Stmp5.ui = Sv31.ui^Sv32.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv31.ui = Sv31.ui^Stmp5.ui;								
	Sv32.ui = Sv32.ui^Stmp5.ui;								

	Stmp5.ui = Stmp1.ui^Stmp2.ui;							
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Stmp1.ui = Stmp1.ui^Stmp5.ui;							
	Stmp2.ui = Stmp2.ui^Stmp5.ui;							

	// If columns 1-2 have been swapped, negate 2nd column of A and V so that V is still a rotation

	Stmp5.f = -2.f;											
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Stmp4.f = 1.f;											
	Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);					

	Sa12.f = Sa12.f*Stmp4.f;								
	Sa22.f = Sa22.f*Stmp4.f;								
	Sa32.f = Sa32.f*Stmp4.f;								

	Sv12.f = Sv12.f*Stmp4.f;								
	Sv22.f = Sv22.f*Stmp4.f;								
	Sv32.f = Sv32.f*Stmp4.f;								

	// Swap columns 1-3 if necessary

	Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;		
	Stmp5.ui = Sa11.ui^Sa13.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa11.ui = Sa11.ui^Stmp5.ui;								
	Sa13.ui = Sa13.ui^Stmp5.ui;								

	Stmp5.ui = Sa21.ui^Sa23.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa21.ui = Sa21.ui^Stmp5.ui;								
	Sa23.ui = Sa23.ui^Stmp5.ui;								

	Stmp5.ui = Sa31.ui^Sa33.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa31.ui = Sa31.ui^Stmp5.ui;								
	Sa33.ui = Sa33.ui^Stmp5.ui;								

	Stmp5.ui = Sv11.ui^Sv13.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv11.ui = Sv11.ui^Stmp5.ui;								
	Sv13.ui = Sv13.ui^Stmp5.ui;								

	Stmp5.ui = Sv21.ui^Sv23.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv21.ui = Sv21.ui^Stmp5.ui;								
	Sv23.ui = Sv23.ui^Stmp5.ui;								

	Stmp5.ui = Sv31.ui^Sv33.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv31.ui = Sv31.ui^Stmp5.ui;								
	Sv33.ui = Sv33.ui^Stmp5.ui;								

	Stmp5.ui = Stmp1.ui^Stmp3.ui;							
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Stmp1.ui = Stmp1.ui^Stmp5.ui;							
	Stmp3.ui = Stmp3.ui^Stmp5.ui;							

	// If columns 1-3 have been swapped, negate 1st column of A and V so that V is still a rotation

	Stmp5.f = -2.f;											
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Stmp4.f = 1.f;											
	Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);					

	Sa11.f = Sa11.f*Stmp4.f;								
	Sa21.f = Sa21.f*Stmp4.f;								
	Sa31.f = Sa31.f*Stmp4.f;								

	Sv11.f = Sv11.f*Stmp4.f;								
	Sv21.f = Sv21.f*Stmp4.f;								
	Sv31.f = Sv31.f*Stmp4.f;								

	// Swap columns 2-3 if necessary

	Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;		
	Stmp5.ui = Sa12.ui^Sa13.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa12.ui = Sa12.ui^Stmp5.ui;								
	Sa13.ui = Sa13.ui^Stmp5.ui;								

	Stmp5.ui = Sa22.ui^Sa23.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa22.ui = Sa22.ui^Stmp5.ui;								
	Sa23.ui = Sa23.ui^Stmp5.ui;								

	Stmp5.ui = Sa32.ui^Sa33.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sa32.ui = Sa32.ui^Stmp5.ui;								
	Sa33.ui = Sa33.ui^Stmp5.ui;								

	Stmp5.ui = Sv12.ui^Sv13.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv12.ui = Sv12.ui^Stmp5.ui;								
	Sv13.ui = Sv13.ui^Stmp5.ui;								

	Stmp5.ui = Sv22.ui^Sv23.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv22.ui = Sv22.ui^Stmp5.ui;								
	Sv23.ui = Sv23.ui^Stmp5.ui;								

	Stmp5.ui = Sv32.ui^Sv33.ui;								
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Sv32.ui = Sv32.ui^Stmp5.ui;								
	Sv33.ui = Sv33.ui^Stmp5.ui;								

	Stmp5.ui = Stmp2.ui^Stmp3.ui;							
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Stmp2.ui = Stmp2.ui^Stmp5.ui;							
	Stmp3.ui = Stmp3.ui^Stmp5.ui;							

	// If columns 2-3 have been swapped, negate 3rd column of A and V so that V is still a rotation

	Stmp5.f = -2.f;											
	Stmp5.ui = Stmp5.ui&Stmp4.ui;							
	Stmp4.f = 1.f;											
	Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);					

	Sa13.f = Sa13.f*Stmp4.f;								
	Sa23.f = Sa23.f*Stmp4.f;								
	Sa33.f = Sa33.f*Stmp4.f;								

	Sv13.f = Sv13.f*Stmp4.f;								
	Sv23.f = Sv23.f*Stmp4.f;								
	Sv33.f = Sv33.f*Stmp4.f;								

	//###########################################################
	// Construct QR factorization of A*V (=U*D) using Givens rotations
	//###########################################################

	Su11.f = 1.f; Su12.f = 0.f; Su13.f = 0.f;
	Su21.f = 0.f; Su22.f = 1.f; Su23.f = 0.f;
	Su31.f = 0.f; Su32.f = 0.f; Su33.f = 1.f;
	
	Ssh.f = Sa21.f*Sa21.f;								
	Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;	
	Ssh.ui = Ssh.ui&Sa21.ui;							

	Stmp5.f = 0.f;										
	Sch.f = __fsub_rn(Stmp5.f, Sa11.f);					
	Sch.f = max(Sch.f, Sa11.f);							
	Sch.f = max(Sch.f, gsmall_number);					
	Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;	

	Stmp1.f = Sch.f*Sch.f;								
	Stmp2.f = Ssh.f*Ssh.f;								
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);				
	Stmp1.f = __frsqrt_rn(Stmp2.f);						

	Stmp4.f = Stmp1.f*0.5f;							
	Stmp3.f = Stmp1.f*Stmp4.f;						
	Stmp3.f = Stmp1.f*Stmp3.f;						
	Stmp3.f = Stmp2.f*Stmp3.f;						
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);			
	Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);			
	Stmp1.f = Stmp1.f*Stmp2.f;						

	Sch.f = __fadd_rn(Sch.f, Stmp1.f);				

	Stmp1.ui = ~Stmp5.ui&Ssh.ui;					
	Stmp2.ui = ~Stmp5.ui&Sch.ui;					
	Sch.ui = Stmp5.ui&Sch.ui;						
	Ssh.ui = Stmp5.ui&Ssh.ui;						
	Sch.ui = Sch.ui | Stmp1.ui;						
	Ssh.ui = Ssh.ui | Stmp2.ui;						

	Stmp1.f = Sch.f*Sch.f;							
	Stmp2.f = Ssh.f*Ssh.f;							
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);			
	Stmp1.f = __frsqrt_rn(Stmp2.f);					

	Stmp4.f = Stmp1.f*0.5f;							
	Stmp3.f = Stmp1.f*Stmp4.f;						
	Stmp3.f = Stmp1.f*Stmp3.f;						
	Stmp3.f = Stmp2.f*Stmp3.f;						
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);			
	Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);			

	Sch.f = Sch.f*Stmp1.f;							
	Ssh.f = Ssh.f*Stmp1.f;							

	Sc.f = Sch.f*Sch.f;								
	Ss.f = Ssh.f*Ssh.f;								
	Sc.f = __fsub_rn(Sc.f, Ss.f);					
	Ss.f = Ssh.f*Sch.f;								
	Ss.f = __fadd_rn(Ss.f, Ss.f);					

	//###########################################################
	// Rotate matrix A
	//###########################################################

	Stmp1.f = Ss.f*Sa11.f;									
	Stmp2.f = Ss.f*Sa21.f;									
	Sa11.f = Sc.f*Sa11.f;									
	Sa21.f = Sc.f*Sa21.f;									
	Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);					
	Sa21.f = __fsub_rn(Sa21.f, Stmp1.f);					

	Stmp1.f = Ss.f*Sa12.f;									
	Stmp2.f = Ss.f*Sa22.f;									
	Sa12.f = Sc.f*Sa12.f;									
	Sa22.f = Sc.f*Sa22.f;									
	Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);					
	Sa22.f = __fsub_rn(Sa22.f, Stmp1.f);					

	Stmp1.f = Ss.f*Sa13.f;									
	Stmp2.f = Ss.f*Sa23.f;									
	Sa13.f = Sc.f*Sa13.f;									
	Sa23.f = Sc.f*Sa23.f;									
	Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);					
	Sa23.f = __fsub_rn(Sa23.f, Stmp1.f);					

	//###########################################################
	// Update matrix U
	//###########################################################

	Stmp1.f = Ss.f*Su11.f;
	Stmp2.f = Ss.f*Su12.f;
	Su11.f = Sc.f*Su11.f;
	Su12.f = Sc.f*Su12.f;
	Su11.f = __fadd_rn(Su11.f, Stmp2.f);
	Su12.f = __fsub_rn(Su12.f, Stmp1.f);

	Stmp1.f = Ss.f*Su21.f;
	Stmp2.f = Ss.f*Su22.f;
	Su21.f = Sc.f*Su21.f;
	Su22.f = Sc.f*Su22.f;
	Su21.f = __fadd_rn(Su21.f, Stmp2.f);
	Su22.f = __fsub_rn(Su22.f, Stmp1.f);

	Stmp1.f = Ss.f*Su31.f;								
	Stmp2.f = Ss.f*Su32.f;								
	Su31.f = Sc.f*Su31.f;
	Su32.f = Sc.f*Su32.f;
	Su31.f = __fadd_rn(Su31.f, Stmp2.f);
	Su32.f = __fsub_rn(Su32.f, Stmp1.f);

	// Second Givens rotation

	Ssh.f = Sa31.f*Sa31.f;								
	Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;	
	Ssh.ui = Ssh.ui&Sa31.ui;							

	Stmp5.f = 0.f;										
	Sch.f = __fsub_rn(Stmp5.f, Sa11.f);					
	Sch.f = max(Sch.f, Sa11.f);							
	Sch.f = max(Sch.f, gsmall_number);					
	Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;	

	Stmp1.f = Sch.f*Sch.f;								
	Stmp2.f = Ssh.f*Ssh.f;								
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);				
	Stmp1.f = __frsqrt_rn(Stmp2.f);						

	Stmp4.f = Stmp1.f*0.5;							
	Stmp3.f = Stmp1.f*Stmp4.f;						
	Stmp3.f = Stmp1.f*Stmp3.f;						
	Stmp3.f = Stmp2.f*Stmp3.f;						
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);			
	Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);			
	Stmp1.f = Stmp1.f*Stmp2.f;						

	Sch.f = __fadd_rn(Sch.f, Stmp1.f);				

	Stmp1.ui = ~Stmp5.ui&Ssh.ui;					
	Stmp2.ui = ~Stmp5.ui&Sch.ui;					
	Sch.ui = Stmp5.ui&Sch.ui;						
	Ssh.ui = Stmp5.ui&Ssh.ui;						
	Sch.ui = Sch.ui | Stmp1.ui;						
	Ssh.ui = Ssh.ui | Stmp2.ui;						

	Stmp1.f = Sch.f*Sch.f;							
	Stmp2.f = Ssh.f*Ssh.f;							
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);			
	Stmp1.f = __frsqrt_rn(Stmp2.f);					

	Stmp4.f = Stmp1.f*0.5f;									
	Stmp3.f = Stmp1.f*Stmp4.f;								
	Stmp3.f = Stmp1.f*Stmp3.f;								
	Stmp3.f = Stmp2.f*Stmp3.f;								
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);					
	Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);					

	Sch.f = Sch.f*Stmp1.f;									
	Ssh.f = Ssh.f*Stmp1.f;									

	Sc.f = Sch.f*Sch.f;										
	Ss.f = Ssh.f*Ssh.f;										
	Sc.f = __fsub_rn(Sc.f, Ss.f);							
	Ss.f = Ssh.f*Sch.f;										
	Ss.f = __fadd_rn(Ss.f, Ss.f);							

	//###########################################################
	// Rotate matrix A
	//###########################################################

	Stmp1.f = Ss.f*Sa11.f;									
	Stmp2.f = Ss.f*Sa31.f;									
	Sa11.f = Sc.f*Sa11.f;									
	Sa31.f = Sc.f*Sa31.f;									
	Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);					
	Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);					

	Stmp1.f = Ss.f*Sa12.f;									
	Stmp2.f = Ss.f*Sa32.f;									
	Sa12.f = Sc.f*Sa12.f;									
	Sa32.f = Sc.f*Sa32.f;									
	Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);					
	Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);					

	Stmp1.f = Ss.f*Sa13.f;									
	Stmp2.f = Ss.f*Sa33.f;									
	Sa13.f = Sc.f*Sa13.f;									
	Sa33.f = Sc.f*Sa33.f;									
	Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);					
	Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);					

	//###########################################################
	// Update matrix U
	//###########################################################

	Stmp1.f = Ss.f*Su11.f;
	Stmp2.f = Ss.f*Su13.f;
	Su11.f = Sc.f*Su11.f;
	Su13.f = Sc.f*Su13.f;
	Su11.f = __fadd_rn(Su11.f, Stmp2.f);
	Su13.f = __fsub_rn(Su13.f, Stmp1.f);

	Stmp1.f = Ss.f*Su21.f;
	Stmp2.f = Ss.f*Su23.f;
	Su21.f = Sc.f*Su21.f;
	Su23.f = Sc.f*Su23.f;
	Su21.f = __fadd_rn(Su21.f, Stmp2.f);
	Su23.f = __fsub_rn(Su23.f, Stmp1.f);

	Stmp1.f = Ss.f*Su31.f;
	Stmp2.f = Ss.f*Su33.f;
	Su31.f = Sc.f*Su31.f;
	Su33.f = Sc.f*Su33.f;
	Su31.f = __fadd_rn(Su31.f, Stmp2.f);
	Su33.f = __fsub_rn(Su33.f, Stmp1.f);

	// Third Givens Rotation

	Ssh.f = Sa32.f*Sa32.f;								
	Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;	
	Ssh.ui = Ssh.ui&Sa32.ui;							

	Stmp5.f = 0.f;										
	Sch.f = __fsub_rn(Stmp5.f, Sa22.f);					
	Sch.f = max(Sch.f, Sa22.f);							
	Sch.f = max(Sch.f, gsmall_number);					
	Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;	

	Stmp1.f = Sch.f*Sch.f;								
	Stmp2.f = Ssh.f*Ssh.f;								
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);				
	Stmp1.f = __frsqrt_rn(Stmp2.f);						

	Stmp4.f = Stmp1.f*0.5f;							
	Stmp3.f = Stmp1.f*Stmp4.f;						
	Stmp3.f = Stmp1.f*Stmp3.f;						
	Stmp3.f = Stmp2.f*Stmp3.f;						
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);			
	Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);			
	Stmp1.f = Stmp1.f*Stmp2.f;						

	Sch.f = __fadd_rn(Sch.f, Stmp1.f);				

	Stmp1.ui = ~Stmp5.ui&Ssh.ui;					
	Stmp2.ui = ~Stmp5.ui&Sch.ui;					
	Sch.ui = Stmp5.ui&Sch.ui;						
	Ssh.ui = Stmp5.ui&Ssh.ui;						
	Sch.ui = Sch.ui | Stmp1.ui;						
	Ssh.ui = Ssh.ui | Stmp2.ui;						

	Stmp1.f = Sch.f*Sch.f;							
	Stmp2.f = Ssh.f*Ssh.f;							
	Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);			
	Stmp1.f = __frsqrt_rn(Stmp2.f);					

	Stmp4.f = Stmp1.f*0.5f;							
	Stmp3.f = Stmp1.f*Stmp4.f;						
	Stmp3.f = Stmp1.f*Stmp3.f;						
	Stmp3.f = Stmp2.f*Stmp3.f;						
	Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);			
	Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);			

	Sch.f = Sch.f*Stmp1.f;							
	Ssh.f = Ssh.f*Stmp1.f;							

	Sc.f = Sch.f*Sch.f;								
	Ss.f = Ssh.f*Ssh.f;								
	Sc.f = __fsub_rn(Sc.f, Ss.f);					
	Ss.f = Ssh.f*Sch.f;								
	Ss.f = __fadd_rn(Ss.f, Ss.f);					

	//###########################################################
	// Rotate matrix A
	//###########################################################

	Stmp1.f = Ss.f*Sa21.f;									
	Stmp2.f = Ss.f*Sa31.f;									
	Sa21.f = Sc.f*Sa21.f;									
	Sa31.f = Sc.f*Sa31.f;									
	Sa21.f = __fadd_rn(Sa21.f, Stmp2.f);					
	Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);					

	Stmp1.f = Ss.f*Sa22.f;									
	Stmp2.f = Ss.f*Sa32.f;									
	Sa22.f = Sc.f*Sa22.f;									
	Sa32.f = Sc.f*Sa32.f;									
	Sa22.f = __fadd_rn(Sa22.f, Stmp2.f);					
	Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);					

	Stmp1.f = Ss.f*Sa23.f;									
	Stmp2.f = Ss.f*Sa33.f;									
	Sa23.f = Sc.f*Sa23.f;									
	Sa33.f = Sc.f*Sa33.f;									
	Sa23.f = __fadd_rn(Sa23.f, Stmp2.f);					
	Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);					

	//###########################################################
	// Update matrix U
	//###########################################################

	Stmp1.f = Ss.f*Su12.f;
	Stmp2.f = Ss.f*Su13.f;
	Su12.f = Sc.f*Su12.f;									
	Su13.f = Sc.f*Su13.f;									
	Su12.f = __fadd_rn(Su12.f, Stmp2.f);
	Su13.f = __fsub_rn(Su13.f, Stmp1.f);

	Stmp1.f = Ss.f*Su22.f;
	Stmp2.f = Ss.f*Su23.f;
	Su22.f = Sc.f*Su22.f;
	Su23.f = Sc.f*Su23.f;
	Su22.f = __fadd_rn(Su22.f, Stmp2.f);
	Su23.f = __fsub_rn(Su23.f, Stmp1.f);

	Stmp1.f = Ss.f*Su32.f;
	Stmp2.f = Ss.f*Su33.f;
	Su32.f = Sc.f*Su32.f;
	Su33.f = Sc.f*Su33.f;
	Su32.f = __fadd_rn(Su32.f, Stmp2.f);					
	Su33.f = __fsub_rn(Su33.f, Stmp1.f);					

	v11 = Sv11.f; v12 = Sv12.f; v13 = Sv13.f;
	v21 = Sv21.f; v22 = Sv22.f; v23 = Sv23.f;
	v31 = Sv31.f; v32 = Sv32.f; v33 = Sv33.f;

	u11 = Su11.f; u12 = Su12.f; u13 = Su13.f;
	u21 = Su21.f; u22 = Su22.f; u23 = Su23.f;
	u31 = Su31.f; u32 = Su32.f; u33 = Su33.f;

	s11 = Sa11.f; 
	//s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f; 
	s22 = Sa22.f; 
	//s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f; 
	s33 = Sa33.f;
}
#endif
