% Network overview diagram.
%   Top row:    "ECG signal sequences" -> "Transformer Encoder" -> "Latent vector".
%   Bottom row: "Spectrogram" -> two ResNet blocks -> three AxAtt blocks -> (+) ->
%               "Linear layer" -> "Classification"; the latent vector is routed to
%               the top edge of every block and to the (+) node.
%   Inset:      enlarged box titled "Transformer Encoder" showing its internals.
% The colours tud2a, tud3a, tud5a, tud6a and tud10a are not defined here and must
% be provided by the including document.
\begin{tikzpicture}[
	line join=round,
	>={Stealth[inset=0pt,length=3.5pt,angle'=45]},
	every node/.style={font=\fontsize{5}{5}\sffamily},
	scale=0.6,
	% Picture-local helper styles (pure shorthands for option lists used repeatedly below).
	% vlabel: label rotated by 90 degrees, centred, optional text width (default 2.25cm).
	vlabel/.style={rotate=90, align=center, text width=#1},
	vlabel/.default=2.25cm,
	% spec block: outline/fill shared by the five trapezoids of the bottom row.
	spec block/.style={semithick, black, fill=tud3a!50},
	% detail font: smaller font for labels inside the narrow boxes of the inset.
	detail font/.style={font=\fontsize{4}{4}\sffamily}
]
	% ---- Top row: ECG input, encoder box, latent vector ----
	\draw[semithick, black, fill=tud6a!50]  (-3.5,3) rectangle (-2.5,0);
	\node[vlabel] at (-3,1.5) {Transformer Encoder};
	\node[vlabel] at (-4.5,1.5) {ECG signal sequences};
	% Five parallel input arrows entering the left edge of the encoder box.
	\foreach \y in {2.5, 2, 1.5, 1, 0.5}
		\draw[thick, ->] (-4,\y) -- (-3.5,\y);
	\draw[thick, ->] (-2.5,1.5) -- (-2,1.5);
	\node[vlabel] at (-1.7,1.5) {Latent vector};

	% ---- Bottom row: five trapezoids of decreasing height, each with its label ----
	\draw[spec block] (-3.5,-1) -- (-2.5,-1.5) -- (-2.5,-4) -- (-3.5,-4.5) -- cycle;
	\node[vlabel=2.5cm] at (-3,-2.75) {ResNet block};
	\draw[spec block] (-1.5,-1.25) -- (-0.5,-1.75) -- (-0.5,-3.75) -- (-1.5,-4.25) -- cycle;
	\node[vlabel] at (-1.05,-2.75) {ResNet block};
	\draw[spec block] (0.5,-1.5) -- (1.5,-2) -- (1.5,-3.5) -- (0.5,-4) -- cycle;
	\node[vlabel=1.5cm] at (0.95,-2.75) {AxAtt block};
	\draw[spec block] (2.5,-1.75) -- (3.5,-2.25) -- (3.5,-3.25) -- (2.5,-3.75) -- cycle;
	\node[vlabel=0.75cm] at (2.95,-2.75) {AxAtt block};
	\draw[spec block] (4.5,-2) -- (5.5,-2.5) -- (5.5,-3) -- (4.5,-3.5) -- cycle;
	\node[vlabel=0.5cm] at (5,-2.75) {AxAtt b.};

	% ---- Latent-vector routing ----
	% A horizontal line at y=-0.5 with a drop onto each trapezoid; every tip ends
	% slightly above the slanted top edge of its trapezoid.
	\draw[thick, ->] (-1.65,0.4) -- (-1.65,-0.5) -- (5,-0.5) -- (5,-2.23);
	\draw[thick, <-] (-3,-1.22) -- (-3,-0.5) -- (-1.65,-0.5);
	\foreach \x/\ytip in {3/-1.98, 1/-1.73, -1/-1.47}
		\draw[thick, <-] (\x,\ytip) -- (\x,-0.5);

	% ---- Horizontal data flow between consecutive stages (last one runs to the linear layer) ----
	\foreach \xs/\xe in {-2.5/-1.5, -0.5/0.5, 1.5/2.5, 3.5/4.5, 5.5/6.5}
		\draw[thick, ->] (\xs,-2.75) -- (\xe,-2.75);

	% (+) node painted over the last arrow (white fill hides the line underneath),
	% with a second latent-vector branch entering it from above.
	\draw[fill=white] (5.95,-2.75) circle (0.2cm);
	\node at (5.95,-2.75) {$+$};
	\draw[thick, ->] (5,-0.5) -- (5.95,-0.5) -- (5.95,-2.55);

	% ---- Output head ----
	\draw[semithick, black, fill=tud10a!50]  (6.5,-1.5) rectangle (7,-4);
	\node[vlabel] at (6.75,-2.75) {Linear layer};
	\draw[thick, ->] (7,-2.75) -- (8,-2.75);
	\node[vlabel] at (8.35,-2.75) {Classification};

	% ---- Spectrogram input of the bottom row ----
	\draw[thick, ->] (-4,-2.75) -- (-3.5,-2.75);
	\node[rotate=90] at (-4.35,-2.75) {Spectrogram};

	% ---- Inset: internals of the Transformer Encoder ----
	% Every arrow in this scope uses the shorter 2.5pt tip, so it is set once here
	% instead of on each path.
	\begin{scope}[xshift=1.5cm, >={Stealth[inset=0pt,length=2.5pt,angle'=45]}]
		% Outer frame and title.
		\draw[semithick, fill=gray!30, rounded corners=2mm]  (-1,3) rectangle (6,0);
		\node[align=center, text width=5cm] at (2.75,2.76) {Transformer Encoder};
		\node[vlabel] at (-0.65,1.4) {Input};
		% Lighter inner frame around the attention / feed-forward boxes; the
		% "N x" label is placed below it.
		\draw[semithick, fill=gray!10] (0.55,2.5) rectangle (4.55,0.5);

		% Input line with a (+) node that receives the positional encoding from above.
		\draw[thick, ->] (-0.425,1.4) -- (1.25,1.4);
		\draw[fill=white] (0.05,1.4) circle (0.2cm);
		\node at (0.05,1.4) {$+$};
		\draw[thick, <-] (0.05,1.6) -- (0.05,2.1);
		% ".\ " forces a normal interword space after the abbreviation period.
		\node at (-0.2,2.3) {Pos.\ Enc.};

		% Multi-head attention box followed by its "Add & LN" box.
		\draw[fill=tud6a!50]  (1.25,0.75) rectangle (2,2.05);
		\node[vlabel=1cm, detail font] at (1.6,1.4) {Multi-Head attention};
		\draw[fill=tud5a!50]  (2.15,0.75) rectangle (2.5,2.05);
		\node[vlabel, detail font] at (2.325,1.4) {Add \& LN};
		\draw[thick] (2,1.4) -- (2.15,1.4);
		% Two extra branches of the input line into the attention box (upper and lower).
		\foreach \y in {1.8, 1}
			\draw[thick, ->] (0.9,1.4) -- (0.9,\y) -- (1.24,\y);
		% Bypass from the input line over the attention box into "Add & LN".
		\draw[thick, ->] (0.74,1.4) -- (0.74,2.26) -- (2.34,2.26) -- (2.34,2.06);

		% Feed-forward box with its own bypass and "Add & LN" box: same layout as
		% above, shifted right by 1.66.
		\begin{scope}[shift={(1.66,0)}]
			\draw[thick, ->] (1,1.4) -- (1,2.26) -- (2.34,2.26) -- (2.34,2.06);
			\draw[fill=tud2a!50]  (1.25,0.75) rectangle (2,2.05);
			\node[vlabel=0.7cm, detail font] at (1.6,1.4) {Feed forward};
			\draw[fill=tud5a!50]  (2.15,0.75) rectangle (2.5,2.05);
			\node[vlabel, detail font] at (2.325,1.4) {Add \& LN};
			\draw[thick] (2,1.4) -- (2.15,1.4);
		\end{scope}

		% Connections: first "Add & LN" -> feed forward, second "Add & LN" -> pooling box.
		\draw[thick, ->] (2.5,1.4) -- (2.9,1.4);
		\draw[thick, ->] (4.16,1.4) -- (5.06,1.4);
		\draw[fill=tud5a!50]  (5.07,0.75) rectangle (5.42,2.05);
		\node[vlabel, detail font] at (5.245,1.4) {Avg.\ pool};
		% Output arrow; it ends just outside the right edge of the outer frame.
		\draw[thick, ->] (5.42,1.4) -- (6.2,1.4);
		\node[align=center, text width=2.25cm] at (2.725,0.24) {N $\times$};
	\end{scope}
\end{tikzpicture}