1 /*
2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
4 *
5 * Authors:
6 * Andrew Galante, haiku.galante@gmail.com
7 * Axel Dörfler, axeld@pinc-software.de
8 * Hugo Santos, hugosantos@gmail.com
9 */
10
11
12 #include "TCPEndpoint.h"
13
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <stdint.h>
22
23 #include <KernelExport.h>
24 #include <Select.h>
25
26 #include <net_buffer.h>
27 #include <net_datalink.h>
28 #include <net_stat.h>
29 #include <NetBufferUtilities.h>
30 #include <NetUtilities.h>
31
32 #include <lock.h>
33 #include <low_resource_manager.h>
34 #include <tracing.h>
35 #include <util/AutoLock.h>
36 #include <util/list.h>
37
38 #include "EndpointManager.h"
39
40
41 // References:
42 // - RFC 793 - Transmission Control Protocol
43 // - RFC 813 - Window and Acknowledgement Strategy in TCP
44 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP
45 //
46 // Things incomplete in this implementation:
47 // - TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS
48 // - Congestion Control, RFC 5681
//	- Limited Transmit, RFC 3042
50 // - SACK, Selective Acknowledgment; RFC 2018, RFC 2883, RFC 6675
51 // - NewReno Modification to TCP's Fast Recovery, RFC 2582
52 //
53 // Things this implementation currently doesn't implement:
54 // - Explicit Congestion Notification (ECN), RFC 3168
55 // - SYN-Cache
56 // - Forward RTO-Recovery, RFC 4138
57 // - Time-Wait hash instead of keeping sockets alive
58
// Renders \a address as a string through AddressString, using this endpoint's
// Domain(); used as an argument for the TRACE()/PROBE() output in this file.
#define PrintAddress(address) \
	AddressString(Domain(), (address), true).Data()
61
62 //#define TRACE_TCP
63 //#define PROBE_TCP
64
// Debug output helper: with TRACE_TCP defined, every message is passed to
// dprintf() prefixed by the current thread, the system time, this endpoint
// and the name of its state. Otherwise TRACE() is an empty statement.
#ifdef TRACE_TCP
	// the space before ', ##args' is important in order for this to work
	// with cpp 2.95
#	define TRACE(format, args...) \
		dprintf("%" B_PRId32 ": TCP [%" B_PRIdBIGTIME "] %p (%12s) " format \
			"\n", find_thread(NULL), system_time(), this, \
			name_for_state(fState) , ##args)
#else
#	define TRACE(args...) \
		do { } while (0)
#endif
73
// Send probe: with PROBE_TCP defined, prints the source/destination and size
// of \a buffer together with the endpoint's send-side variables and the given
// \a window via dprintf(). Otherwise PROBE() is an empty statement.
#ifdef PROBE_TCP
#	define PROBE(buffer, window) \
		dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 \
			" snxt %" B_PRIu32 " suna %" B_PRIu32 " cw %" B_PRIu32 \
			" sst %" B_PRIu32 " win %" B_PRIu32 " swin %" B_PRIu32 \
			" smax-suna %" B_PRIu32 " savail %" B_PRIuSIZE \
			" sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
			system_time(), \
			PrintAddress(buffer->source), \
			PrintAddress(buffer->destination), \
			buffer->size, \
			fSendNext.Number(), \
			fSendUnacknowledged.Number(), \
			fCongestionWindow, \
			fSlowStartThreshold, \
			window, \
			fSendWindow, \
			(fSendMax - fSendUnacknowledged).Number(), \
			fSendQueue.Available(fSendNext), \
			fSendQueue.Used(), \
			fRetransmitTimeout)
#else
#	define PROBE(buffer, window) \
		do { } while (0)
#endif
88
89 #if TCP_TRACING
90 namespace TCPTracing {
91
92 class Receive : public AbstractTraceEntry {
93 public:
Receive(TCPEndpoint * endpoint,tcp_segment_header & segment,uint32 window,net_buffer * buffer)94 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
95 net_buffer* buffer)
96 :
97 fEndpoint(endpoint),
98 fBuffer(buffer),
99 fBufferSize(buffer->size),
100 fSequence(segment.sequence),
101 fAcknowledge(segment.acknowledge),
102 fWindow(window),
103 fState(endpoint->State()),
104 fFlags(segment.flags)
105 {
106 Initialized();
107 }
108
AddDump(TraceOutput & out)109 virtual void AddDump(TraceOutput& out)
110 {
111 out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
112 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
113 ", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer,
114 fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
115 }
116
117 protected:
118 TCPEndpoint* fEndpoint;
119 net_buffer* fBuffer;
120 uint32 fBufferSize;
121 uint32 fSequence;
122 uint32 fAcknowledge;
123 uint32 fWindow;
124 tcp_state fState;
125 uint8 fFlags;
126 };
127
128 class Send : public AbstractTraceEntry {
129 public:
Send(TCPEndpoint * endpoint,tcp_segment_header & segment,net_buffer * buffer,tcp_sequence firstSequence,tcp_sequence lastSequence)130 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
131 tcp_sequence firstSequence, tcp_sequence lastSequence)
132 :
133 fEndpoint(endpoint),
134 fBuffer(buffer),
135 fBufferSize(buffer->size),
136 fSequence(segment.sequence),
137 fAcknowledge(segment.acknowledge),
138 fFirstSequence(firstSequence.Number()),
139 fLastSequence(lastSequence.Number()),
140 fState(endpoint->State()),
141 fFlags(segment.flags)
142 {
143 Initialized();
144 }
145
AddDump(TraceOutput & out)146 virtual void AddDump(TraceOutput& out)
147 {
148 out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
149 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
150 ", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
151 name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence,
152 fAcknowledge, fFirstSequence, fLastSequence);
153 }
154
155 protected:
156 TCPEndpoint* fEndpoint;
157 net_buffer* fBuffer;
158 uint32 fBufferSize;
159 uint32 fSequence;
160 uint32 fAcknowledge;
161 uint32 fFirstSequence;
162 uint32 fLastSequence;
163 tcp_state fState;
164 uint8 fFlags;
165 };
166
167 class State : public AbstractTraceEntry {
168 public:
State(TCPEndpoint * endpoint)169 State(TCPEndpoint* endpoint)
170 :
171 fEndpoint(endpoint),
172 fState(endpoint->State())
173 {
174 Initialized();
175 }
176
AddDump(TraceOutput & out)177 virtual void AddDump(TraceOutput& out)
178 {
179 out.Print("tcp:%p (%12s) state change", fEndpoint,
180 name_for_state(fState));
181 }
182
183 protected:
184 TCPEndpoint* fEndpoint;
185 tcp_state fState;
186 };
187
188 class Spawn : public AbstractTraceEntry {
189 public:
Spawn(TCPEndpoint * listeningEndpoint,TCPEndpoint * spawnedEndpoint)190 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
191 :
192 fListeningEndpoint(listeningEndpoint),
193 fSpawnedEndpoint(spawnedEndpoint)
194 {
195 Initialized();
196 }
197
AddDump(TraceOutput & out)198 virtual void AddDump(TraceOutput& out)
199 {
200 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
201 }
202
203 protected:
204 TCPEndpoint* fListeningEndpoint;
205 TCPEndpoint* fSpawnedEndpoint;
206 };
207
208 class Error : public AbstractTraceEntry {
209 public:
Error(TCPEndpoint * endpoint,const char * error,int32 line)210 Error(TCPEndpoint* endpoint, const char* error, int32 line)
211 :
212 fEndpoint(endpoint),
213 fLine(line),
214 fError(error),
215 fState(endpoint->State())
216 {
217 Initialized();
218 }
219
AddDump(TraceOutput & out)220 virtual void AddDump(TraceOutput& out)
221 {
222 out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
223 name_for_state(fState), fLine, fError);
224 }
225
226 protected:
227 TCPEndpoint* fEndpoint;
228 int32 fLine;
229 const char* fError;
230 tcp_state fState;
231 };
232
233 class TimerSet : public AbstractTraceEntry {
234 public:
TimerSet(TCPEndpoint * endpoint,const char * which,bigtime_t timeout)235 TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
236 :
237 fEndpoint(endpoint),
238 fWhich(which),
239 fTimeout(timeout),
240 fState(endpoint->State())
241 {
242 Initialized();
243 }
244
AddDump(TraceOutput & out)245 virtual void AddDump(TraceOutput& out)
246 {
247 out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
248 name_for_state(fState), fWhich, fTimeout);
249 }
250
251 protected:
252 TCPEndpoint* fEndpoint;
253 const char* fWhich;
254 bigtime_t fTimeout;
255 tcp_state fState;
256 };
257
258 class TimerTriggered : public AbstractTraceEntry {
259 public:
TimerTriggered(TCPEndpoint * endpoint,const char * which)260 TimerTriggered(TCPEndpoint* endpoint, const char* which)
261 :
262 fEndpoint(endpoint),
263 fWhich(which),
264 fState(endpoint->State())
265 {
266 Initialized();
267 }
268
AddDump(TraceOutput & out)269 virtual void AddDump(TraceOutput& out)
270 {
271 out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
272 name_for_state(fState), fWhich);
273 }
274
275 protected:
276 TCPEndpoint* fEndpoint;
277 const char* fWhich;
278 tcp_state fState;
279 };
280
281 class APICall : public AbstractTraceEntry {
282 public:
APICall(TCPEndpoint * endpoint,const char * which)283 APICall(TCPEndpoint* endpoint, const char* which)
284 :
285 fEndpoint(endpoint),
286 fWhich(which),
287 fState(endpoint->State())
288 {
289 Initialized();
290 }
291
AddDump(TraceOutput & out)292 virtual void AddDump(TraceOutput& out)
293 {
294 out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
295 name_for_state(fState), fWhich);
296 }
297
298 protected:
299 TCPEndpoint* fEndpoint;
300 const char* fWhich;
301 tcp_state fState;
302 };
303
304 } // namespace TCPTracing
305
306 # define T(x) new(std::nothrow) TCPTracing::x
307 #else
308 # define T(x)
309 #endif // TCP_TRACING
310
311
// Bit constants for the fFlags field; each one occupies its own bit.
enum {
	FLAG_OPTION_WINDOW_SCALE		= 1 << 0,
	FLAG_OPTION_TIMESTAMP			= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE					= 1 << 2,
	FLAG_CLOSED						= 1 << 3,
	FLAG_DELETE_ON_CLOSE			= 1 << 4,
	FLAG_LOCAL						= 1 << 5,
	FLAG_RECOVERY					= 1 << 6,
	FLAG_OPTION_SACK_PERMITTED		= 1 << 7,
	FLAG_AUTO_RECEIVE_BUFFER_SIZE	= 1 << 8,
};
327
328
// Divisor that turns the microsecond system time into the millisecond
// TCP time used by tcp_now().
static const int kTimestampFactor = 1000;
331
332
333 static inline bigtime_t
absolute_timeout(bigtime_t timeout)334 absolute_timeout(bigtime_t timeout)
335 {
336 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
337 return timeout;
338
339 return timeout + system_time();
340 }
341
342
343 static inline status_t
posix_error(status_t error)344 posix_error(status_t error)
345 {
346 if (error == B_TIMED_OUT)
347 return B_WOULD_BLOCK;
348
349 return error;
350 }
351
352
353 static inline bool
in_window(const tcp_sequence & sequence,const tcp_sequence & receiveNext,uint32 receiveWindow)354 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
355 uint32 receiveWindow)
356 {
357 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
358 }
359
360
361 static inline bool
segment_in_sequence(const tcp_segment_header & segment,int size,const tcp_sequence & receiveNext,uint32 receiveWindow)362 segment_in_sequence(const tcp_segment_header& segment, int size,
363 const tcp_sequence& receiveNext, uint32 receiveWindow)
364 {
365 tcp_sequence sequence(segment.sequence);
366 if (size == 0) {
367 if (receiveWindow == 0)
368 return sequence == receiveNext;
369 return in_window(sequence, receiveNext, receiveWindow);
370 } else {
371 if (receiveWindow == 0)
372 return false;
373 return in_window(sequence, receiveNext, receiveWindow)
374 || in_window(sequence + size - 1, receiveNext, receiveWindow);
375 }
376 }
377
378
379 static inline bool
is_writable(tcp_state state)380 is_writable(tcp_state state)
381 {
382 return state == ESTABLISHED || state == FINISH_RECEIVED;
383 }
384
385
386 static inline bool
is_establishing(tcp_state state)387 is_establishing(tcp_state state)
388 {
389 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED;
390 }
391
392
393 static inline uint32
tcp_now()394 tcp_now()
395 {
396 return system_time() / kTimestampFactor;
397 }
398
399
400 static inline uint32
tcp_diff_timestamp(uint32 base)401 tcp_diff_timestamp(uint32 base)
402 {
403 const uint32 now = tcp_now();
404 if (now >= base)
405 return now - base;
406
407 return now + UINT_MAX - base;
408 }
409
410
411 static inline bool
state_needs_finish(int32 state)412 state_needs_finish(int32 state)
413 {
414 return state == WAIT_FOR_FINISH_ACKNOWLEDGE
415 || state == FINISH_SENT || state == CLOSING;
416 }
417
418
419 // #pragma mark -
420
421
TCPEndpoint(net_socket * socket)422 TCPEndpoint::TCPEndpoint(net_socket* socket)
423 :
424 ProtocolSocket(socket),
425 fManager(NULL),
426 fOptions(0),
427 fSendWindowShift(0),
428 fReceiveWindowShift(0),
429 fSendUnacknowledged(0),
430 fSendNext(0),
431 fSendMax(0),
432 fSendUrgentOffset(0),
433 fSendWindow(0),
434 fSendMaxWindow(0),
435 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
436 fSendMaxSegments(0),
437 fSendQueue(socket->send.buffer_size),
438 fInitialSendSequence(0),
439 fPreviousHighestAcknowledge(0),
440 fDuplicateAcknowledgeCount(0),
441 fPreviousFlightSize(0),
442 fRecover(0),
443 fRoute(NULL),
444 fReceiveNext(0),
445 fReceiveMaxAdvertised(0),
446 fReceiveWindow(socket->receive.buffer_size),
447 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
448 fReceiveQueue(socket->receive.buffer_size),
449 fSmoothedRoundTripTime(-1),
450 fRoundTripVariation(0),
451 fSendTime(0),
452 fRoundTripStartSequence(0),
453 fRetransmitTimeout(TCP_INITIAL_RTT),
454 fReceivedTimestamp(0),
455 fCongestionWindow(0),
456 fSlowStartThreshold(0),
457 fState(CLOSED),
458 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP
459 | FLAG_OPTION_SACK_PERMITTED | FLAG_AUTO_RECEIVE_BUFFER_SIZE)
460 {
461 // TODO: to be replaced with a real read/write locking strategy!
462 mutex_init(&fLock, "tcp lock");
463
464 fReceiveCondition.Init(this, "tcp receive");
465 fSendCondition.Init(this, "tcp send");
466
467 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
468 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
469 this);
470 gStackModule->init_timer(&fDelayedAcknowledgeTimer,
471 TCPEndpoint::_DelayedAcknowledgeTimer, this);
472 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
473 this);
474
475 T(APICall(this, "constructor"));
476 }
477
478
~TCPEndpoint()479 TCPEndpoint::~TCPEndpoint()
480 {
481 mutex_lock(&fLock);
482
483 T(APICall(this, "destructor"));
484
485 _CancelConnectionTimers();
486 gStackModule->cancel_timer(&fTimeWaitTimer);
487 T(TimerSet(this, "time-wait", -1));
488
489 if (fManager != NULL) {
490 fManager->Unbind(this);
491 put_endpoint_manager(fManager);
492 }
493
494 mutex_destroy(&fLock);
495
496 // we need to wait for all timers to return
497 gStackModule->wait_for_timer(&fRetransmitTimer);
498 gStackModule->wait_for_timer(&fPersistTimer);
499 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
500 gStackModule->wait_for_timer(&fTimeWaitTimer);
501
502 gDatalinkModule->put_route(Domain(), fRoute);
503 }
504
505
506 status_t
InitCheck() const507 TCPEndpoint::InitCheck() const
508 {
509 return B_OK;
510 }
511
512
513 // #pragma mark - protocol API
514
515
516 status_t
Open()517 TCPEndpoint::Open()
518 {
519 TRACE("Open()");
520 T(APICall(this, "open"));
521
522 status_t status = ProtocolSocket::Open();
523 if (status < B_OK)
524 return status;
525
526 fManager = get_endpoint_manager(Domain());
527 if (fManager == NULL)
528 return EAFNOSUPPORT;
529
530 return B_OK;
531 }
532
533
534 status_t
Close()535 TCPEndpoint::Close()
536 {
537 MutexLocker locker(fLock);
538
539 TRACE("Close()");
540 T(APICall(this, "close"));
541
542 if (fState == LISTEN)
543 delete_sem(fAcceptSemaphore);
544
545 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
546 // TODO: what about linger in case of SYNCHRONIZE_SENT?
547 fState = CLOSED;
548 T(State(this));
549 return B_OK;
550 }
551
552 // handle linger with zero timeout
553 if ((socket->options & SO_LINGER) != 0 && socket->linger == 0) {
554 fState = CLOSED;
555 T(State(this));
556 return _SendQueued(true);
557 }
558
559 status_t status = _Disconnect(true);
560 if (status != B_OK)
561 return status;
562
563 if ((socket->options & SO_LINGER) != 0) {
564 TRACE("Close(): Lingering for %i secs", socket->linger);
565
566 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
567
568 while (fSendQueue.Used() > 0) {
569 status = _WaitForCondition(fSendCondition, locker, maximum);
570 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
571 break;
572 else if (status < B_OK)
573 return status;
574 }
575
576 TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE
577 " bytes.", fSendQueue.Used());
578 }
579 return B_OK;
580 }
581
582
583 void
Free()584 TCPEndpoint::Free()
585 {
586 MutexLocker _(fLock);
587
588 TRACE("Free()");
589 T(APICall(this, "free"));
590
591 if (fState <= SYNCHRONIZE_SENT)
592 return;
593
594 fFlags |= FLAG_CLOSED;
595 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
596 // we'll be freed later when the 2MSL timer expires
597 gSocketModule->acquire_socket(socket);
598
599 // we are only interested in the timer, not in changing state
600 _EnterTimeWait();
601 }
602 }
603
604
605 /*! Creates and sends a synchronize packet to /a address, and then waits
606 until the connection has been established or refused.
607 */
608 status_t
Connect(const sockaddr * address)609 TCPEndpoint::Connect(const sockaddr* address)
610 {
611 if (!AddressModule()->is_same_family(address))
612 return EAFNOSUPPORT;
613
614 MutexLocker locker(fLock);
615
616 TRACE("Connect() on address %s", PrintAddress(address));
617 T(APICall(this, "connect"));
618
619 if (gStackModule->is_restarted_syscall()) {
620 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
621 status_t status = _WaitForEstablished(locker, timeout);
622 TRACE(" Connect(): Connection complete: %s (timeout was %"
623 B_PRIdBIGTIME ")", strerror(status), timeout);
624 return posix_error(status);
625 }
626
627 // Can only call connect() from CLOSED or LISTEN states
628 // otherwise endpoint is considered already connected
629 if (fState == LISTEN) {
630 // this socket is about to connect; remove pending connections in the backlog
631 gSocketModule->set_max_backlog(socket, 0);
632 } else if (fState == ESTABLISHED) {
633 return EISCONN;
634 } else if (fState != CLOSED)
635 return EALREADY;
636
637 // consider destination address INADDR_ANY as INADDR_LOOPBACK
638 sockaddr_storage _address;
639 if (AddressModule()->is_empty_address(address, false)) {
640 AddressModule()->get_loopback_address((sockaddr *)&_address);
641 // for IPv4 and IPv6 the port is at the same offset
642 ((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
643 address = (sockaddr *)&_address;
644 }
645
646 status_t status = _PrepareSendPath(address);
647 if (status < B_OK)
648 return status;
649
650 TRACE(" Connect(): starting 3-way handshake...");
651
652 fState = SYNCHRONIZE_SENT;
653 T(State(this));
654
655 // send SYN
656 status = _SendAcknowledge();
657 if (status != B_OK) {
658 _Close();
659 return status;
660 }
661
662 // If we are running over Loopback, after _SendQueued() returns we
663 // may be in ESTABLISHED already.
664 if (fState == ESTABLISHED) {
665 TRACE(" Connect() completed after _SendQueued()");
666 return B_OK;
667 }
668
669 // wait until 3-way handshake is complete (if needed)
670 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
671 if (timeout == 0) {
672 // we're a non-blocking socket
673 TRACE(" Connect() delayed, return EINPROGRESS");
674 return EINPROGRESS;
675 }
676
677 bigtime_t absoluteTimeout = absolute_timeout(timeout);
678 gStackModule->store_syscall_restart_timeout(absoluteTimeout);
679
680 status = _WaitForEstablished(locker, absoluteTimeout);
681 TRACE(" Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME
682 ")", strerror(status), timeout);
683 return posix_error(status);
684 }
685
686
687 status_t
Accept(struct net_socket ** _acceptedSocket)688 TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
689 {
690 MutexLocker locker(fLock);
691
692 TRACE("Accept()");
693 T(APICall(this, "accept"));
694
695 status_t status;
696 bigtime_t timeout = absolute_timeout(socket->receive.timeout);
697 if (gStackModule->is_restarted_syscall())
698 timeout = gStackModule->restore_syscall_restart_timeout();
699 else
700 gStackModule->store_syscall_restart_timeout(timeout);
701
702 do {
703 locker.Unlock();
704
705 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
706 | B_CAN_INTERRUPT, timeout);
707 if (status != B_OK) {
708 if (status == B_TIMED_OUT && socket->receive.timeout == 0)
709 return B_WOULD_BLOCK;
710
711 return status;
712 }
713
714 locker.Lock();
715 status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
716 #ifdef TRACE_TCP
717 if (status == B_OK)
718 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol);
719 #endif
720 } while (status != B_OK);
721
722 return status;
723 }
724
725
726 status_t
Bind(const sockaddr * address)727 TCPEndpoint::Bind(const sockaddr *address)
728 {
729 if (address == NULL)
730 return B_BAD_VALUE;
731
732 MutexLocker lock(fLock);
733
734 TRACE("Bind() on address %s", PrintAddress(address));
735 T(APICall(this, "bind"));
736
737 if (fState != CLOSED)
738 return EISCONN;
739
740 return fManager->Bind(this, address);
741 }
742
743
744 status_t
Unbind(struct sockaddr * address)745 TCPEndpoint::Unbind(struct sockaddr *address)
746 {
747 MutexLocker _(fLock);
748
749 TRACE("Unbind()");
750 T(APICall(this, "unbind"));
751
752 return fManager->Unbind(this);
753 }
754
755
756 status_t
Listen(int count)757 TCPEndpoint::Listen(int count)
758 {
759 MutexLocker _(fLock);
760
761 TRACE("Listen()");
762 T(APICall(this, "listen"));
763
764 if (fState != CLOSED && fState != LISTEN)
765 return B_BAD_VALUE;
766
767 if (fState == CLOSED) {
768 fAcceptSemaphore = create_sem(0, "tcp accept");
769 if (fAcceptSemaphore < B_OK)
770 return ENOBUFS;
771
772 status_t status = fManager->SetPassive(this);
773 if (status != B_OK) {
774 delete_sem(fAcceptSemaphore);
775 fAcceptSemaphore = -1;
776 return status;
777 }
778 }
779
780 gSocketModule->set_max_backlog(socket, count);
781
782 fState = LISTEN;
783 T(State(this));
784 return B_OK;
785 }
786
787
788 status_t
Shutdown(int direction)789 TCPEndpoint::Shutdown(int direction)
790 {
791 MutexLocker lock(fLock);
792
793 TRACE("Shutdown(%i)", direction);
794 T(APICall(this, "shutdown"));
795
796 if (direction == SHUT_RD || direction == SHUT_RDWR)
797 fFlags |= FLAG_NO_RECEIVE;
798
799 if (direction == SHUT_WR || direction == SHUT_RDWR) {
800 // TODO: That's not correct. After read/write shutting down the socket
801 // one should still be able to read previously arrived data.
802 _Disconnect(false);
803 }
804
805 return B_OK;
806 }
807
808
809 /*! Puts data contained in \a buffer into send buffer */
810 status_t
SendData(net_buffer * buffer)811 TCPEndpoint::SendData(net_buffer *buffer)
812 {
813 MutexLocker lock(fLock);
814
815 TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
816 ") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
817 buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free());
818 T(APICall(this, "senddata"));
819
820 const uint32 flags = buffer->msg_flags;
821 if ((flags & ~(MSG_DONTWAIT | MSG_OOB | MSG_EOF)) != 0)
822 return EOPNOTSUPP;
823
824 if (fState == CLOSED)
825 return ENOTCONN;
826 if (fState == LISTEN)
827 return EDESTADDRREQ;
828 if (!is_writable(fState) && !is_establishing(fState))
829 return EPIPE;
830
831 size_t left = buffer->size;
832
833 bigtime_t timeout = 0;
834 if ((flags & MSG_DONTWAIT) == 0) {
835 timeout = absolute_timeout(socket->send.timeout);
836 if (gStackModule->is_restarted_syscall())
837 timeout = gStackModule->restore_syscall_restart_timeout();
838 else
839 gStackModule->store_syscall_restart_timeout(timeout);
840 }
841
842 while (left > 0) {
843 while (fSendQueue.Free() < socket->send.low_water_mark) {
844 // initiate a send before waiting
845 _SendQueued();
846
847 // wait until enough space is available
848 status_t status = _WaitForCondition(fSendCondition, lock, timeout);
849 if (status < B_OK) {
850 TRACE(" SendData() returning %s (%d)",
851 strerror(posix_error(status)), (int)posix_error(status));
852 return posix_error(status);
853 }
854
855 if (!is_writable(fState) && !is_establishing(fState))
856 return EPIPE;
857 }
858
859 size_t size = fSendQueue.Free();
860 if (size < left) {
861 // we need to split the original buffer
862 net_buffer* clone = gBufferModule->clone(buffer, false);
863 // TODO: add offset/size parameter to net_buffer::clone() or
864 // even a move_data() function, as this is a bit inefficient
865 if (clone == NULL)
866 return ENOBUFS;
867
868 status_t status = gBufferModule->trim(clone, size);
869 if (status != B_OK) {
870 gBufferModule->free(clone);
871 return status;
872 }
873
874 gBufferModule->remove_header(buffer, size);
875 left -= size;
876 fSendQueue.Add(clone);
877 } else {
878 left -= buffer->size;
879 fSendQueue.Add(buffer);
880 }
881 }
882
883 TRACE(" SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used());
884
885 bool force = false;
886 if ((flags & MSG_OOB) != 0) {
887 fSendUrgentOffset = fSendQueue.LastSequence();
888 // RFC 961 specifies that the urgent offset points to the last
889 // byte of urgent data. However, this is commonly implemented as
890 // here, ie. it points to the first byte after the urgent data.
891 force = true;
892 }
893 if ((flags & MSG_EOF) != 0)
894 _Disconnect(false);
895
896 _SendQueued(force);
897
898 return B_OK;
899 }
900
901
902 ssize_t
SendAvailable()903 TCPEndpoint::SendAvailable()
904 {
905 MutexLocker locker(fLock);
906
907 ssize_t available;
908
909 if (is_writable(fState))
910 available = fSendQueue.Free();
911 else if (is_establishing(fState))
912 available = 0;
913 else
914 available = EPIPE;
915
916 TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
917 T(APICall(this, "sendavailable"));
918 return available;
919 }
920
921
922 status_t
FillStat(net_stat * stat)923 TCPEndpoint::FillStat(net_stat *stat)
924 {
925 MutexLocker _(fLock);
926
927 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
928 stat->receive_queue_size = fReceiveQueue.Available();
929 stat->send_queue_size = fSendQueue.Used();
930
931 return B_OK;
932 }
933
934
935 status_t
ReadData(size_t numBytes,uint32 flags,net_buffer ** _buffer)936 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
937 {
938 if ((flags & ~(MSG_DONTWAIT | MSG_WAITALL | MSG_PEEK)) != 0)
939 return EOPNOTSUPP;
940
941 MutexLocker locker(fLock);
942
943 TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
944 flags);
945 T(APICall(this, "readdata"));
946
947 *_buffer = NULL;
948
949 if (fState == CLOSED || fState == LISTEN) {
950 if (socket->error != B_OK)
951 return socket->error;
952 return ENOTCONN;
953 }
954
955 bigtime_t timeout = 0;
956 if ((flags & MSG_DONTWAIT) == 0) {
957 timeout = absolute_timeout(socket->receive.timeout);
958 if (gStackModule->is_restarted_syscall())
959 timeout = gStackModule->restore_syscall_restart_timeout();
960 else
961 gStackModule->store_syscall_restart_timeout(timeout);
962 }
963
964 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
965 if (flags & MSG_DONTWAIT)
966 return B_WOULD_BLOCK;
967
968 status_t status = _WaitForEstablished(locker, timeout);
969 if (status < B_OK)
970 return posix_error(status);
971 }
972
973 size_t dataNeeded = socket->receive.low_water_mark;
974
975 // When MSG_WAITALL is set then the function should block
976 // until the full amount of data can be returned.
977 if (flags & MSG_WAITALL)
978 dataNeeded = numBytes;
979
980 // TODO: add support for urgent data (MSG_OOB)
981
982 while (true) {
983 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
984 || fState == TIME_WAIT) {
985 // ``Connection closing''.
986 if (fReceiveQueue.Available() > 0)
987 break;
988 return B_OK;
989 }
990
991 if (fReceiveQueue.Available() > 0) {
992 if (fReceiveQueue.Available() >= dataNeeded
993 || (fReceiveQueue.PushedData() > 0
994 && fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
995 break;
996 } else if (fState == FINISH_RECEIVED) {
997 // ``If no text is awaiting delivery, the RECEIVE will
998 // get a Connection closing''.
999 return B_OK;
1000 }
1001
1002 if (timeout == 0)
1003 return B_WOULD_BLOCK;
1004
1005 if ((fFlags & FLAG_NO_RECEIVE) != 0)
1006 return B_OK;
1007
1008 status_t status = _WaitForCondition(fReceiveCondition, locker, timeout);
1009 if (status < B_OK) {
1010 // The Open Group base specification mentions that EINTR should be
1011 // returned if the recv() is interrupted before _any data_ is
1012 // available. So we actually check if there is data, and if so,
1013 // push it to the user.
1014 if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
1015 && fReceiveQueue.Available() > 0)
1016 break;
1017
1018 return posix_error(status);
1019 }
1020 }
1021
1022 TRACE(" ReadData(): %" B_PRIuSIZE " are available.",
1023 fReceiveQueue.Available());
1024
1025 if (numBytes < fReceiveQueue.Available())
1026 fReceiveCondition.NotifyAll();
1027
1028 bool clone = (flags & MSG_PEEK) != 0;
1029
1030 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
1031
1032 TRACE(" ReadData(): %" B_PRIuSIZE " bytes kept.",
1033 fReceiveQueue.Available());
1034
1035 if (fReceiveQueue.Available() == 0 && fState == FINISH_RECEIVED)
1036 socket->receive.low_water_mark = 0;
1037
1038 // if we opened the window, check if we should send a window update
1039 if (!clone) {
1040 // Only send if there's less than half the window size remaining.
1041 // TODO: This should use fReceiveWindow but that stays constant at present
1042 // due to another TODO.
1043 uint32 windowSizeRemaining = (fReceiveMaxAdvertised - fReceiveNext).Number();
1044 if (fReceiveNext == (fReceiveMaxAdvertised + 1))
1045 windowSizeRemaining = 0;
1046 if (windowSizeRemaining <= (fReceiveQueue.Size() / 2))
1047 _SendAcknowledge();
1048 }
1049
1050 return receivedBytes;
1051 }
1052
1053
1054 ssize_t
ReadAvailable()1055 TCPEndpoint::ReadAvailable()
1056 {
1057 MutexLocker locker(fLock);
1058
1059 TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());
1060 T(APICall(this, "readavailable"));
1061
1062 return _AvailableData();
1063 }
1064
1065
1066 status_t
SetSendBufferSize(size_t length)1067 TCPEndpoint::SetSendBufferSize(size_t length)
1068 {
1069 MutexLocker _(fLock);
1070 fSendQueue.SetMaxBytes(length);
1071 return B_OK;
1072 }
1073
1074
1075 status_t
SetReceiveBufferSize(size_t length)1076 TCPEndpoint::SetReceiveBufferSize(size_t length)
1077 {
1078 MutexLocker _(fLock);
1079 fReceiveQueue.SetMaxBytes(length);
1080 fFlags &= ~FLAG_AUTO_RECEIVE_BUFFER_SIZE;
1081 return B_OK;
1082 }
1083
1084
1085 status_t
GetOption(int option,void * _value,int * _length)1086 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1087 {
1088 if (*_length != sizeof(int))
1089 return B_BAD_VALUE;
1090
1091 int* value = (int*)_value;
1092
1093 switch (option) {
1094 case TCP_NODELAY:
1095 if ((fOptions & TCP_NODELAY) != 0)
1096 *value = 1;
1097 else
1098 *value = 0;
1099 return B_OK;
1100
1101 case TCP_MAXSEG:
1102 *value = fReceiveMaxSegmentSize;
1103 return B_OK;
1104
1105 default:
1106 return B_BAD_VALUE;
1107 }
1108 }
1109
1110
1111 status_t
SetOption(int option,const void * _value,int length)1112 TCPEndpoint::SetOption(int option, const void* _value, int length)
1113 {
1114 if (option != TCP_NODELAY)
1115 return B_BAD_VALUE;
1116
1117 if (length != sizeof(int))
1118 return B_BAD_VALUE;
1119
1120 const int* value = (const int*)_value;
1121
1122 MutexLocker _(fLock);
1123 if (*value)
1124 fOptions |= TCP_NODELAY;
1125 else
1126 fOptions &= ~TCP_NODELAY;
1127
1128 return B_OK;
1129 }
1130
1131
1132 // #pragma mark - misc
1133
1134
1135 bool
IsBound() const1136 TCPEndpoint::IsBound() const
1137 {
1138 return !LocalAddress().IsEmpty(true);
1139 }
1140
1141
1142 bool
IsLocal() const1143 TCPEndpoint::IsLocal() const
1144 {
1145 return (fFlags & FLAG_LOCAL) != 0;
1146 }
1147
1148
1149 status_t
DelayedAcknowledge()1150 TCPEndpoint::DelayedAcknowledge()
1151 {
1152 // ACKs "MUST" be generated within 500ms of the first unACKed packet, and
1153 // "SHOULD" be for at least every second full-size segment. (RFC 5681 § 4.2)
1154
1155 bigtime_t delay = TCP_DELAYED_ACKNOWLEDGE_TIMEOUT;
1156 if ((fReceiveNext - fLastAcknowledgeSent) >= (fReceiveMaxSegmentSize * 2)) {
1157 // Trigger an immediate timeout rather than invoking Send directly,
1158 // allowing multiple invocations to be coalesced.
1159 delay = 0;
1160 } else if (gStackModule->is_timer_active(&fDelayedAcknowledgeTimer)) {
1161 return B_OK;
1162 }
1163
1164 gStackModule->set_timer(&fDelayedAcknowledgeTimer, delay);
1165 T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT));
1166 return B_OK;
1167 }
1168
1169
1170 void
_StartPersistTimer()1171 TCPEndpoint::_StartPersistTimer()
1172 {
1173 gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
1174 T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT));
1175 }
1176
1177
1178 void
_EnterTimeWait()1179 TCPEndpoint::_EnterTimeWait()
1180 {
1181 TRACE("_EnterTimeWait()");
1182
1183 if (fState == TIME_WAIT)
1184 _CancelConnectionTimers();
1185
1186 _UpdateTimeWait();
1187 }
1188
1189
1190 void
_UpdateTimeWait()1191 TCPEndpoint::_UpdateTimeWait()
1192 {
1193 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1194 T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1));
1195 }
1196
1197
/*!	Cancels the retransmit, persist, and delayed acknowledge timers.
	Each cancellation is recorded in the trace with a timeout of -1.
	The time-wait timer is not touched by this method.
*/
void
TCPEndpoint::_CancelConnectionTimers()
{
	gStackModule->cancel_timer(&fRetransmitTimer);
	T(TimerSet(this, "retransmit", -1));
	gStackModule->cancel_timer(&fPersistTimer);
	T(TimerSet(this, "persist", -1));
	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
	T(TimerSet(this, "delayed ack", -1));
}
1208
1209
1210 /*! Sends the FIN flag to the peer when the connection is still open.
1211 Moves the endpoint to the next state depending on where it was.
1212 */
1213 status_t
_Disconnect(bool closing)1214 TCPEndpoint::_Disconnect(bool closing)
1215 {
1216 tcp_state previousState = fState;
1217
1218 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1219 fState = FINISH_SENT;
1220 else if (fState == FINISH_RECEIVED)
1221 fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1222 else
1223 return B_OK;
1224
1225 T(State(this));
1226
1227 status_t status = _SendQueued();
1228 if (status != B_OK) {
1229 fState = previousState;
1230 T(State(this));
1231 return status;
1232 }
1233
1234 return B_OK;
1235 }
1236
1237
1238 void
_MarkEstablished()1239 TCPEndpoint::_MarkEstablished()
1240 {
1241 fState = ESTABLISHED;
1242 T(State(this));
1243
1244 gSocketModule->set_connected(socket);
1245 if (gSocketModule->has_parent(socket))
1246 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1247
1248 fSendCondition.NotifyAll();
1249 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
1250 }
1251
1252
1253 status_t
_WaitForEstablished(MutexLocker & locker,bigtime_t timeout)1254 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1255 {
1256 // TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1257 // When investigating this, also have a look at _Close() and _HandleReset().
1258 while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1259 if (socket->error != B_OK)
1260 return socket->error;
1261
1262 status_t status = _WaitForCondition(fSendCondition, locker, timeout);
1263 if (status < B_OK)
1264 return status;
1265 }
1266
1267 return B_OK;
1268 }
1269
1270
1271 // #pragma mark - receive
1272
1273
1274 void
_Close()1275 TCPEndpoint::_Close()
1276 {
1277 _CancelConnectionTimers();
1278 fState = CLOSED;
1279 T(State(this));
1280
1281 fFlags |= FLAG_DELETE_ON_CLOSE;
1282
1283 fSendCondition.NotifyAll();
1284 _NotifyReader();
1285
1286 if (gSocketModule->has_parent(socket)) {
1287 // We still have a parent - obviously, we haven't been accepted yet,
1288 // so no one could ever close us.
1289 _CancelConnectionTimers();
1290 gSocketModule->set_aborted(socket);
1291 }
1292 }
1293
1294
/*!	Aborts the connection with the given \a error.

	The error is stored in the socket before the endpoint is closed, so that
	anything woken up by _Close() already finds it there. Afterwards, write
	and error select events are sent with \a error as their value.

	\param error The error code to store in the socket and to report.
*/
void
TCPEndpoint::_HandleReset(status_t error)
{
	socket->error = error;
	_Close();

	gSocketModule->notify(socket, B_SELECT_WRITE, error);
	gSocketModule->notify(socket, B_SELECT_ERROR, error);
}
1304
1305
/*!	Handles a duplicate acknowledgement.

	Counts the duplicate and, depending on the count, either sends new data
	with a temporarily enlarged congestion window (first and second
	duplicate), retransmits starting at the acknowledged sequence (third
	duplicate), or grows the congestion window and sends new data (any
	further duplicate).

	\param segment The header of the segment carrying the duplicate
		acknowledgement.
*/
void
TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
{
	// On the first duplicate, remember how much data was in flight; this is
	// used to compute the slow start threshold on the third duplicate.
	if (fDuplicateAcknowledgeCount == 0)
		fPreviousFlightSize = (fSendMax - fSendUnacknowledged).Number();

	if (++fDuplicateAcknowledgeCount < 3) {
		// First or second duplicate: if there is unsent data and the peer's
		// window is open, send it with the congestion window enlarged only
		// for the duration of this call.
		if (fSendQueue.Available(fSendMax) != 0 && fSendWindow != 0) {
			fSendNext = fSendMax;
			fCongestionWindow += fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
			_SendQueued();
			TRACE("_DuplicateAcknowledge(): packet sent under limited transmit on receipt of dup ack");
			fCongestionWindow -= fDuplicateAcknowledgeCount * fSendMaxSegmentSize;
		}
	}

	if (fDuplicateAcknowledgeCount == 3) {
		// Third duplicate: enter recovery if the acknowledgement is beyond
		// the recovery point, or if the congestion window exceeds one segment
		// and at most four segments were acknowledged since the previous
		// highest acknowledgement.
		if ((segment.acknowledge - 1) > fRecover || (fCongestionWindow > fSendMaxSegmentSize &&
			(fSendUnacknowledged - fPreviousHighestAcknowledge) <= 4 * fSendMaxSegmentSize)) {
			fFlags |= FLAG_RECOVERY;
			fRecover = fSendMax.Number() - 1;
			fSlowStartThreshold = max_c(fPreviousFlightSize / 2, 2 * fSendMaxSegmentSize);
			fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize;
			// Resend starting at the first unacknowledged byte
			fSendNext = segment.acknowledge;
			_SendQueued();
			TRACE("_DuplicateAcknowledge(): packet sent under fast restransmit on the receipt of 3rd dup ack");
		}
	} else if (fDuplicateAcknowledgeCount > 3) {
		// Further duplicates: grow the congestion window by one segment per
		// duplicate, but not beyond what the current flight size accounts for.
		uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
		if ((fDuplicateAcknowledgeCount - 3) * fSendMaxSegmentSize <= flightSize)
			fCongestionWindow += fSendMaxSegmentSize;
		if (fSendQueue.Available(fSendMax) != 0) {
			fSendNext = fSendMax;
			_SendQueued();
		}
	}
}
1343
1344
1345 void
_UpdateTimestamps(tcp_segment_header & segment,size_t segmentLength)1346 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1347 size_t segmentLength)
1348 {
1349 if (fFlags & FLAG_OPTION_TIMESTAMP) {
1350 tcp_sequence sequence(segment.sequence);
1351
1352 if (fLastAcknowledgeSent >= sequence
1353 && fLastAcknowledgeSent < (sequence + segmentLength))
1354 fReceivedTimestamp = segment.timestamp_value;
1355 }
1356 }
1357
1358
1359 void
_UpdateReceiveBuffer()1360 TCPEndpoint::_UpdateReceiveBuffer()
1361 {
1362 if (fReceiveWindowShift == 0 || fSmoothedRoundTripTime < 0)
1363 return;
1364 if ((fFlags & FLAG_AUTO_RECEIVE_BUFFER_SIZE) == 0)
1365 return;
1366
1367 const uint64 maxWindowSize = UINT16_MAX << fReceiveWindowShift;
1368 if (fReceiveQueue.Size() == maxWindowSize) {
1369 // The window is already as large as it could be.
1370 return;
1371 }
1372
1373 if (fReceiveSizingTimestamp == 0) {
1374 fReceiveSizingTimestamp = tcp_now();
1375 fReceiveSizingReference = fReceiveNext;
1376 return;
1377 }
1378
1379 const uint32 received = (fReceiveNext - fReceiveSizingReference).Number();
1380 if (received < (fReceiveQueue.Size() / 2))
1381 return;
1382
1383 const uint32 since = tcp_diff_timestamp(fReceiveSizingTimestamp);
1384 if (since == 0 || since < ((uint32)fSmoothedRoundTripTime * 2))
1385 return;
1386
1387 fReceiveSizingTimestamp = tcp_now();
1388 fReceiveSizingReference = fReceiveNext;
1389
1390 // TODO: add low_resource hook to reduce window sizes.
1391 if (low_resource_state(B_KERNEL_RESOURCE_MEMORY) != B_NO_LOW_RESOURCE)
1392 return;
1393
1394 // Target a window large enough to fit one second of traffic.
1395 const uint64 oneSecondReceive = (received * 1000LL) / since;
1396 if (fReceiveQueue.Size() >= oneSecondReceive)
1397 return;
1398
1399 // Don't increase the window size too rapidly.
1400 uint32 newWindowSize = min_c(oneSecondReceive, UINT32_MAX);
1401 if (newWindowSize > (fReceiveQueue.Size() * 2))
1402 newWindowSize = fReceiveQueue.Size() * 2;
1403
1404 // Round to the window shift and max segment size.
1405 uint32 newWindowShifted = newWindowSize >> fReceiveWindowShift;
1406 const uint32 maxSegmentShifted = fReceiveMaxSegmentSize >> fReceiveWindowShift;
1407 newWindowShifted = (newWindowShifted / maxSegmentShifted) * maxSegmentShifted;
1408 newWindowSize = newWindowShifted << fReceiveWindowShift;
1409
1410 if (newWindowSize > maxWindowSize)
1411 newWindowSize = maxWindowSize;
1412 if (fReceiveQueue.Size() >= newWindowSize)
1413 return;
1414
1415 fReceiveQueue.SetMaxBytes(newWindowSize);
1416 TRACE("TCPEndpoint: updated receive buffer size to %" B_PRIu32 "\n", newWindowSize);
1417 }
1418
1419
1420 ssize_t
_AvailableData() const1421 TCPEndpoint::_AvailableData() const
1422 {
1423 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1424 // the application of FLAG_NO_RECEIVE in listen()ing
1425 // sockets.
1426 if (fState == LISTEN)
1427 return gSocketModule->count_connected(socket);
1428 if (fState == SYNCHRONIZE_SENT)
1429 return 0;
1430
1431 ssize_t availableData = fReceiveQueue.Available();
1432
1433 if (availableData == 0 && !_ShouldReceive())
1434 return ENOTCONN;
1435 if (availableData == 0 && (fState == FINISH_RECEIVED || fState == WAIT_FOR_FINISH_ACKNOWLEDGE))
1436 return ESHUTDOWN;
1437 return availableData;
1438 }
1439
1440
/*!	Wakes up all threads waiting on the receive condition, and sends a read
	select event carrying the current result of _AvailableData(), which may
	be a byte count, a connection count, or an error code.
*/
void
TCPEndpoint::_NotifyReader()
{
	fReceiveCondition.NotifyAll();
	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
}
1447
1448
1449 bool
_AddData(tcp_segment_header & segment,net_buffer * buffer)1450 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1451 {
1452 if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1453 // Remember the position of the finish received flag
1454 fFinishReceived = true;
1455 fFinishReceivedAt = segment.sequence + buffer->size;
1456 }
1457
1458 fReceiveQueue.Add(buffer, segment.sequence);
1459 fReceiveNext = fReceiveQueue.NextSequence();
1460
1461 if (fFinishReceived) {
1462 // Set or reset the finish flag on the current segment
1463 if (fReceiveNext < fFinishReceivedAt)
1464 segment.flags &= ~TCP_FLAG_FINISH;
1465 else
1466 segment.flags |= TCP_FLAG_FINISH;
1467 }
1468
1469 TRACE(" _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %"
1470 B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available());
1471
1472 _UpdateReceiveBuffer();
1473
1474 if ((segment.flags & TCP_FLAG_PUSH) != 0)
1475 fReceiveQueue.SetPushPointer();
1476
1477 return fReceiveQueue.Available() > 0;
1478 }
1479
1480
1481 void
_PrepareReceivePath(tcp_segment_header & segment)1482 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1483 {
1484 fInitialReceiveSequence = segment.sequence;
1485 fFinishReceived = false;
1486 fReceiveSizingTimestamp = 0;
1487
1488 // count the received SYN
1489 segment.sequence++;
1490
1491 fReceiveNext = segment.sequence;
1492 fReceiveQueue.SetInitialSequence(segment.sequence);
1493
1494 if ((fOptions & TCP_NOOPT) == 0) {
1495 if (segment.max_segment_size > 0) {
1496 // The maximum size of a segment that a TCP endpoint really sends,
1497 // the "effective send MSS", MUST be the smaller of the send MSS and
1498 // the largest transmission size permitted by the IP layer:
1499 fSendMaxSegmentSize = min_c(segment.max_segment_size,
1500 _MaxSegmentSize(*PeerAddress()));
1501 }
1502
1503 if (segment.options & TCP_HAS_WINDOW_SCALE) {
1504 fFlags |= FLAG_OPTION_WINDOW_SCALE;
1505 fSendWindowShift = segment.window_shift;
1506 } else {
1507 fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1508 fReceiveWindowShift = 0;
1509 }
1510
1511 if (segment.options & TCP_HAS_TIMESTAMPS) {
1512 fFlags |= FLAG_OPTION_TIMESTAMP;
1513 fReceivedTimestamp = segment.timestamp_value;
1514 } else
1515 fFlags &= ~FLAG_OPTION_TIMESTAMP;
1516
1517 if ((segment.options & TCP_SACK_PERMITTED) == 0) {
1518 fFlags &= ~FLAG_OPTION_SACK_PERMITTED;
1519 fFlags &= ~FLAG_AUTO_RECEIVE_BUFFER_SIZE;
1520 }
1521 }
1522
1523 if (fSendMaxSegmentSize > 2190)
1524 fCongestionWindow = 2 * fSendMaxSegmentSize;
1525 else if (fSendMaxSegmentSize > 1095)
1526 fCongestionWindow = 3 * fSendMaxSegmentSize;
1527 else
1528 fCongestionWindow = 4 * fSendMaxSegmentSize;
1529
1530 fSendMaxSegments = fCongestionWindow / fSendMaxSegmentSize;
1531 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1532 }
1533
1534
1535 bool
_ShouldReceive() const1536 TCPEndpoint::_ShouldReceive() const
1537 {
1538 if ((fFlags & FLAG_NO_RECEIVE) != 0)
1539 return false;
1540
1541 return fState == ESTABLISHED || fState == FINISH_SENT
1542 || fState == FINISH_ACKNOWLEDGED || fState == FINISH_RECEIVED;
1543 }
1544
1545
1546 int32
_Spawn(TCPEndpoint * parent,tcp_segment_header & segment,net_buffer * buffer)1547 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1548 net_buffer* buffer)
1549 {
1550 MutexLocker _(fLock);
1551
1552 TRACE("Spawn()");
1553
1554 // TODO: proper error handling!
1555 if (ProtocolSocket::Open() != B_OK) {
1556 T(Error(this, "opening failed", __LINE__));
1557 return DROP;
1558 }
1559
1560 fState = SYNCHRONIZE_RECEIVED;
1561 T(Spawn(parent, this));
1562
1563 fManager = parent->fManager;
1564
1565 if (fManager->BindChild(this, buffer->destination) != B_OK) {
1566 T(Error(this, "binding failed", __LINE__));
1567 return DROP;
1568 }
1569 if (_PrepareSendPath(buffer->source) != B_OK) {
1570 T(Error(this, "prepare send faild", __LINE__));
1571 return DROP;
1572 }
1573
1574 fOptions = parent->fOptions;
1575 fAcceptSemaphore = parent->fAcceptSemaphore;
1576
1577 _PrepareReceivePath(segment);
1578
1579 // send SYN+ACK
1580 if (_SendAcknowledge() != B_OK) {
1581 T(Error(this, "sending failed", __LINE__));
1582 return DROP;
1583 }
1584
1585 segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1586 // we handled this flag now, it must not be set for further processing
1587
1588 return _Receive(segment, buffer);
1589 }
1590
1591
1592 int32
_ListenReceive(tcp_segment_header & segment,net_buffer * buffer)1593 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1594 {
1595 TRACE("ListenReceive()");
1596
1597 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1598 // but the error behaviour differs
1599 if (segment.flags & TCP_FLAG_RESET)
1600 return DROP;
1601 if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1602 return DROP | RESET;
1603 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1604 return DROP;
1605
1606 // TODO: drop broadcast/multicast
1607
1608 // spawn new endpoint for accept()
1609 net_socket* newSocket;
1610 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1611 T(Error(this, "spawning failed", __LINE__));
1612 return DROP;
1613 }
1614
1615 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1616 segment, buffer);
1617 }
1618
1619
1620 int32
_SynchronizeSentReceive(tcp_segment_header & segment,net_buffer * buffer)1621 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1622 net_buffer *buffer)
1623 {
1624 TRACE("_SynchronizeSentReceive()");
1625
1626 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1627 && (fInitialSendSequence >= segment.acknowledge
1628 || fSendMax < segment.acknowledge))
1629 return DROP | RESET;
1630
1631 if (segment.flags & TCP_FLAG_RESET) {
1632 _HandleReset(ECONNREFUSED);
1633 return DROP;
1634 }
1635
1636 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1637 return DROP;
1638
1639 fSendUnacknowledged = segment.acknowledge;
1640 _PrepareReceivePath(segment);
1641
1642 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1643 _MarkEstablished();
1644 } else {
1645 // simultaneous open
1646 fState = SYNCHRONIZE_RECEIVED;
1647 T(State(this));
1648 }
1649
1650 segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1651 // we handled this flag now, it must not be set for further processing
1652
1653 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1654 }
1655
1656
/*!	Processes a segment received in any state other than \c LISTEN and
	\c SYNCHRONIZE_SENT.

	After the timestamp check and a fast path for the common cases, the
	segment is checked against the receive window, resets and unexpected
	SYNs are handled, the data is trimmed to the receive window, the send
	window and acknowledgement are processed, and finally the data and a
	possible FIN are consumed, changing the connection state as needed.

	\param segment The header of the received segment; its flags and
		sequence may be modified.
	\param buffer The buffer belonging to the segment. It may be trimmed,
		and must not be accessed anymore after it was added to the receive
		queue.
	\return A combination of the action flags \c DROP, \c KEEP,
		\c ACKNOWLEDGE, \c IMMEDIATE_ACKNOWLEDGE, \c RESET, and
		\c SEND_QUEUED to be acted upon by the caller.
*/
int32
TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
{
	// PAWS processing takes precedence over regular TCP acceptability check
	if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0 && (segment.flags & TCP_FLAG_RESET) == 0) {
		// Timestamps were negotiated, so every non-reset segment must
		// carry one.
		if ((segment.options & TCP_HAS_TIMESTAMPS) == 0)
			return DROP;
		// Drop segments whose timestamp is older than the last one recorded
		if ((int32)(fReceivedTimestamp - segment.timestamp_value) > 0
			&& (fReceivedTimestamp - segment.timestamp_value) <= INT32_MAX)
			return DROP | IMMEDIATE_ACKNOWLEDGE;
	}

	uint32 advertisedWindow = segment.AdvertisedWindow(fSendWindowShift);
	size_t segmentLength = buffer->size;

	// First, handle the most common case for uni-directional data transfer
	// (known as header prediction - the segment must not change the window,
	// and must be the expected sequence, and contain no control flags)

	if (fState == ESTABLISHED
		&& segment.AcknowledgeOnly()
		&& fReceiveNext == segment.sequence
		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
		&& fSendNext == fSendMax) {
		_UpdateTimestamps(segment, segmentLength);

		if (segmentLength == 0) {
			// this is a pure acknowledge segment - we're on the sending end
			if (fSendUnacknowledged < segment.acknowledge
				&& fSendMax >= segment.acknowledge) {
				_Acknowledged(segment);
				return DROP;
			}
		} else if (segment.acknowledge == fSendUnacknowledged
			&& fReceiveQueue.IsContiguous()
			&& fReceiveQueue.Free() >= segmentLength
			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
			// pure in-order data that fits into the receive queue - we're
			// on the receiving end
			if (_AddData(segment, buffer))
				_NotifyReader();

			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
		}
	}

	// The fast path was not applicable, so we continue with the standard
	// processing of the incoming segment

	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);

	if (fState != CLOSED && fState != TIME_WAIT) {
		// Check sequence number
		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
				fReceiveWindow)) {
			TRACE("  Receive(): segment out of window, next: %" B_PRIu32
				" wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow);
			if ((segment.flags & TCP_FLAG_RESET) != 0) {
				// TODO: this doesn't look right - review!
				return DROP;
			}
			return DROP | IMMEDIATE_ACKNOWLEDGE;
		}
	}

	if ((segment.flags & TCP_FLAG_RESET) != 0) {
		// Is this a valid reset?
		// We generally ignore resets in time wait state (see RFC 1337)
		if (fLastAcknowledgeSent <= segment.sequence
			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
				+ fReceiveWindow)
			&& fState != TIME_WAIT) {
			// The error reported to the socket depends on the state the
			// reset arrived in.
			status_t error;
			if (fState == SYNCHRONIZE_RECEIVED)
				error = ECONNREFUSED;
			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
				error = ENOTCONN;
			else
				error = ECONNRESET;

			_HandleReset(error);
		}

		return DROP;
	}

	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
		|| (fState == SYNCHRONIZE_RECEIVED
			&& (fInitialReceiveSequence > segment.sequence
				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
					&& (fSendUnacknowledged > segment.acknowledge
						|| fSendMax < segment.acknowledge))))) {
		// reset the connection - either the initial SYN was faulty, or we
		// received a SYN within the data stream
		return DROP | RESET;
	}

	// TODO: Check this! Why do we advertise a window outside of what we should
	// buffer?
	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
		// the window must not shrink

	// trim buffer to be within the receive window
	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
	if (drop > 0) {
		if ((uint32)drop > buffer->size
			|| ((uint32)drop == buffer->size
				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
			// don't accidentally remove a FIN we shouldn't remove
			segment.flags &= ~TCP_FLAG_FINISH;
			drop = buffer->size;
		}

		// remove duplicate data at the start
		TRACE("* remove %" B_PRId32 " bytes from the start", drop);
		gBufferModule->remove_header(buffer, drop);
		segment.sequence += drop;
	}

	int32 action = KEEP;

	// Immediately acknowledge out-of-order segments, as well as any
	// in-order segments following out-of-order segments, to trigger
	// fast-retransmit and fast-recovery at the sender.
	if (drop != 0 || (drop == 0 && !fReceiveQueue.IsContiguous()))
		action |= IMMEDIATE_ACKNOWLEDGE;

	// "drop" is reused: it now holds the number of bytes that reach beyond
	// the end of the receive window.
	drop = (int32)(segment.sequence + buffer->size
		- (fReceiveNext + fReceiveWindow)).Number();
	if (drop > 0) {
		// remove data exceeding our window
		if ((uint32)drop >= buffer->size) {
			// if we can accept data, or the segment is not what we'd expect,
			// drop the segment (an immediate acknowledge is always triggered)
			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
				return DROP | IMMEDIATE_ACKNOWLEDGE;

			action |= IMMEDIATE_ACKNOWLEDGE;
		}

		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
			// we need to remove the finish, too, as part of the data
			drop--;
		}

		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
		TRACE("* remove %" B_PRId32 " bytes from the end", drop);
		gBufferModule->remove_trailer(buffer, drop);
	}

#ifdef TRACE_TCP
	if (advertisedWindow > fSendWindow) {
		TRACE("  Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
			fSendWindow, advertisedWindow);
	}
#endif

	if (fSendWindow < fSendMaxSegmentSize
		&& (advertisedWindow >= fSendMaxSegmentSize
			|| advertisedWindow > (TCP_DEFAULT_MAX_SEGMENT_SIZE * 3))) {
		// Our current send window is less than a segment wide, and the new one
		// is larger, so trigger a send in case there's anything to be sent.
		action |= SEND_QUEUED;
	}

	fSendWindow = advertisedWindow;
	if (advertisedWindow > fSendMaxWindow)
		fSendMaxWindow = advertisedWindow;

	// Then look at the acknowledgement for any updates

	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
		// process acknowledged data
		if (fState == SYNCHRONIZE_RECEIVED)
			_MarkEstablished();

		// The peer acknowledges something we have not sent yet
		if (fSendMax < segment.acknowledge)
			return DROP | IMMEDIATE_ACKNOWLEDGE;

		if (segment.acknowledge == fSendUnacknowledged) {
			// NOTE(review): fSendWindow was assigned advertisedWindow a few
			// lines above, so the window comparison below looks always true
			// at this point - confirm whether the previous send window was
			// meant to be compared here.
			if (buffer->size == 0 && advertisedWindow == fSendWindow
				&& (segment.flags & TCP_FLAG_FINISH) == 0 && fSendUnacknowledged != fSendMax) {
				TRACE("Receive(): duplicate ack!");
				_DuplicateAcknowledge(segment);
			}
		} else if (segment.acknowledge < fSendUnacknowledged) {
			// an old acknowledgement
			return DROP;
		} else {
			// this segment acknowledges in flight data

			if (fDuplicateAcknowledgeCount >= 3) {
				// deflate the window.
				if (segment.acknowledge > fRecover) {
					uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
					fCongestionWindow = min_c(fSlowStartThreshold,
						max_c(flightSize, fSendMaxSegmentSize) + fSendMaxSegmentSize);
					fFlags &= ~FLAG_RECOVERY;
				}
			}

			if (fSendMax == segment.acknowledge)
				TRACE("Receive(): all inflight data ack'd!");

			if (segment.acknowledge > fSendQueue.LastSequence()
					&& fState > ESTABLISHED) {
				TRACE("Receive(): FIN has been acknowledged!");

				switch (fState) {
					case FINISH_SENT:
						fState = FINISH_ACKNOWLEDGED;
						T(State(this));
						break;
					case CLOSING:
						fState = TIME_WAIT;
						T(State(this));
						_EnterTimeWait();
						return DROP;
					case WAIT_FOR_FINISH_ACKNOWLEDGE:
						_Close();
						break;

					default:
						break;
				}
			}

			if (fState != CLOSED) {
				tcp_sequence last = fLastAcknowledgeSent;
				_Acknowledged(segment);
				// we just sent an acknowledge, remove from action
				if (last < fLastAcknowledgeSent)
					action &= ~IMMEDIATE_ACKNOWLEDGE;
			}
		}
	}

	if (segment.flags & TCP_FLAG_URGENT) {
		if (fState == ESTABLISHED || fState == FINISH_SENT
			|| fState == FINISH_ACKNOWLEDGED) {
			// TODO: Handle urgent data:
			//  - RCV.UP <- max(RCV.UP, SEG.UP)
			//  - signal the user that urgent data is available (SIGURG)
		}
	}

	bool notify = false;

	// The buffer may be freed if its data is added to the queue, so cache
	// the size as we still need it later.
	const uint32 bufferSize = buffer->size;

	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
		&& _ShouldReceive())
		notify = _AddData(segment, buffer);
	else {
		// The buffer was not queued, and can therefore still be accessed.
		// If receiving is disabled, the data is skipped over.
		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			fReceiveNext += buffer->size;

		action = (action & ~KEEP) | DROP;
	}

	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
		// the FIN occupies one sequence number
		segmentLength++;
		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
			TRACE("Receive(): peer is finishing connection!");
			fReceiveNext++;
			notify = true;

			// FIN implies PUSH
			fReceiveQueue.SetPushPointer();

			// we'll reply immediately to the FIN if we are not
			// transitioning to TIME WAIT so we immediately ACK it.
			action |= IMMEDIATE_ACKNOWLEDGE;

			// other side is closing connection; change states
			switch (fState) {
				case ESTABLISHED:
				case SYNCHRONIZE_RECEIVED:
					fState = FINISH_RECEIVED;
					T(State(this));
					break;
				case FINISH_SENT:
					// simultaneous close
					fState = CLOSING;
					T(State(this));
					break;
				case FINISH_ACKNOWLEDGED:
					fState = TIME_WAIT;
					T(State(this));
					_EnterTimeWait();
					break;
				case TIME_WAIT:
					_UpdateTimeWait();
					break;

				default:
					break;
			}
		}
	}

	if (notify)
		_NotifyReader();

	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
		action |= ACKNOWLEDGE;

	_UpdateTimestamps(segment, segmentLength);

	TRACE("Receive() Action %" B_PRId32, action);

	return action;
}
1970
1971
1972 int32
SegmentReceived(tcp_segment_header & segment,net_buffer * buffer)1973 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1974 {
1975 MutexLocker locker(fLock);
1976
1977 TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
1978 "to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
1979 ", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source),
1980 PrintAddress(buffer->destination), segment.flags, segment.sequence,
1981 segment.acknowledge,
1982 (uint32)segment.advertised_window << fSendWindowShift);
1983 T(Receive(this, segment,
1984 (uint32)segment.advertised_window << fSendWindowShift, buffer));
1985 int32 segmentAction = DROP;
1986
1987 switch (fState) {
1988 case LISTEN:
1989 segmentAction = _ListenReceive(segment, buffer);
1990 break;
1991
1992 case SYNCHRONIZE_SENT:
1993 segmentAction = _SynchronizeSentReceive(segment, buffer);
1994 break;
1995
1996 case SYNCHRONIZE_RECEIVED:
1997 case ESTABLISHED:
1998 case FINISH_RECEIVED:
1999 case WAIT_FOR_FINISH_ACKNOWLEDGE:
2000 case FINISH_SENT:
2001 case FINISH_ACKNOWLEDGED:
2002 case CLOSING:
2003 case TIME_WAIT:
2004 case CLOSED:
2005 segmentAction = _Receive(segment, buffer);
2006 break;
2007 }
2008
2009 // process acknowledge action as asked for by the *Receive() method
2010 if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
2011 _SendAcknowledge(true);
2012 else if (segmentAction & ACKNOWLEDGE)
2013 DelayedAcknowledge();
2014
2015 if (segmentAction & SEND_QUEUED)
2016 _SendQueued();
2017
2018 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
2019 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
2020
2021 locker.Unlock();
2022 if (gSocketModule->release_socket(socket))
2023 segmentAction |= DELETED_ENDPOINT;
2024 }
2025
2026 return segmentAction;
2027 }
2028
2029
2030 // #pragma mark - send
2031
2032
2033 tcp_segment_header
_PrepareSendSegment()2034 TCPEndpoint::_PrepareSendSegment()
2035 {
2036 // we don't set FLAG_FINISH here, instead we do it
2037 // conditionally below depending if we are sending
2038 // the last bytes of the send queue.
2039
2040 uint8 flags = 0;
2041 switch (fState) {
2042 case CLOSED:
2043 flags = TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
2044 break;
2045
2046 case SYNCHRONIZE_SENT:
2047 flags = TCP_FLAG_SYNCHRONIZE;
2048 break;
2049
2050 case SYNCHRONIZE_RECEIVED:
2051 flags = TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
2052 break;
2053
2054 case ESTABLISHED:
2055 case FINISH_RECEIVED:
2056 case FINISH_ACKNOWLEDGED:
2057 case TIME_WAIT:
2058 case WAIT_FOR_FINISH_ACKNOWLEDGE:
2059 case FINISH_SENT:
2060 case CLOSING:
2061 flags = TCP_FLAG_ACKNOWLEDGE;
2062
2063 default:
2064 break;
2065 }
2066
2067 tcp_segment_header segment(flags);
2068
2069 if ((fOptions & TCP_NOOPT) == 0) {
2070 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
2071 segment.options |= TCP_HAS_TIMESTAMPS;
2072 segment.timestamp_reply = fReceivedTimestamp;
2073 segment.timestamp_value = tcp_now();
2074 }
2075
2076 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
2077 && fSendNext == fInitialSendSequence) {
2078 // add connection establishment options
2079 segment.max_segment_size = fReceiveMaxSegmentSize;
2080 if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
2081 segment.options |= TCP_HAS_WINDOW_SCALE;
2082 segment.window_shift = fReceiveWindowShift;
2083 }
2084 if ((fFlags & FLAG_OPTION_SACK_PERMITTED) != 0)
2085 segment.options |= TCP_SACK_PERMITTED;
2086 }
2087
2088 if (!fReceiveQueue.IsContiguous()
2089 && (fFlags & FLAG_OPTION_SACK_PERMITTED) != 0) {
2090 segment.options |= TCP_HAS_SACK;
2091 int maxSackCount = MAX_SACK_BLKS
2092 - (((fFlags & FLAG_OPTION_TIMESTAMP) != 0) ? 1 : 0);
2093 memset(segment.sacks, 0, sizeof(segment.sacks));
2094 segment.sackCount = fReceiveQueue.PopulateSackInfo(fReceiveNext,
2095 maxSackCount, segment.sacks);
2096 }
2097 }
2098
2099 size_t availableBytes = fReceiveQueue.Free();
2100 // window size must remain same for duplicate acknowledgements
2101 if (!fReceiveQueue.IsContiguous())
2102 availableBytes = (fReceiveMaxAdvertised - fReceiveNext).Number();
2103 segment.SetAdvertisedWindow(availableBytes, fReceiveWindowShift);
2104
2105 segment.acknowledge = fReceiveNext.Number();
2106
2107 // Process urgent data
2108 if (fSendUrgentOffset > fSendNext) {
2109 segment.flags |= TCP_FLAG_URGENT;
2110 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
2111 } else {
2112 fSendUrgentOffset = fSendUnacknowledged.Number();
2113 // Keep urgent offset updated, so that it doesn't reach into our
2114 // send window on overlap
2115 segment.urgent_offset = 0;
2116 }
2117
2118 return segment;
2119 }
2120
2121
/*!	Completes \a segment, prepends the TCP header to \a buffer, sends it
	via the next protocol, and updates the send state afterwards.

	\param segment The prepared segment header; its sequence is set to
		\c fSendNext. For a SYN, the window scale option and the maximum
		segment size are cleared again after the header has been added.
	\param buffer The data to send. If adding the header or sending fails,
		the buffer is freed here.
	\param isRetransmit If \c true, the segment is never used to start a
		round trip time measurement.
	\return \c B_OK on success, otherwise the error from add_tcp_header()
		or from the next protocol's send_routed_data().
*/
status_t
TCPEndpoint::_PrepareAndSend(tcp_segment_header& segment, net_buffer* buffer,
	bool isRetransmit)
{
	LocalAddress().CopyTo(buffer->source);
	PeerAddress().CopyTo(buffer->destination);

	// "size" is the sequence space the segment occupies (SYN and FIN are
	// added below), "segmentLength" the number of data bytes only.
	uint32 size = buffer->size, segmentLength = size;
	segment.sequence = fSendNext.Number();

	TRACE("_PrepareAndSend(): buffer %p (%" B_PRIu32 " bytes) address %s to "
		"%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
		", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32
		", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32,
		buffer, buffer->size, PrintAddress(buffer->source),
		PrintAddress(buffer->destination), segment.flags, segment.sequence,
		segment.acknowledge, segment.advertised_window,
		fCongestionWindow, fSlowStartThreshold, segmentLength,
		fSendQueue.FirstSequence().Number(),
		fSendQueue.LastSequence().Number());
	T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
		fSendQueue.LastSequence()));

	PROBE(buffer, sendWindow);

	status_t status = add_tcp_header(AddressModule(), segment, buffer);
	if (status != B_OK) {
		gBufferModule->free(buffer);
		return status;
	}

	if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
		// The connection establishment options have been written to the
		// header; remove them from the segment again. The SYN occupies one
		// sequence number.
		segment.options &= ~TCP_HAS_WINDOW_SCALE;
		segment.max_segment_size = 0;
		size++;
	}

	// The FIN occupies one sequence number, too
	if (segment.flags & TCP_FLAG_FINISH)
		size++;

	status = next->module->send_routed_data(next, fRoute, buffer);
	if (status < B_OK) {
		gBufferModule->free(buffer);
		return status;
	}

	// The segment is on its way - the buffer is not touched anymore, only
	// the send state is updated from here on.
	fSendNext += size;
	if (fSendMax < fSendNext)
		fSendMax = fSendNext;

	// Remember the right edge of the window we just advertised
	fReceiveMaxAdvertised = fReceiveNext + segment.AdvertisedWindow(fReceiveWindowShift);

	// One data segment less may be sent while established
	if (segmentLength != 0 && fState == ESTABLISHED)
		--fSendMaxSegments;

	// Start a round trip time measurement if none is in progress; only
	// segments carrying data or a SYN that are not retransmissions qualify.
	if (fSendTime == 0 && !isRetransmit
		&& (segmentLength != 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)) {
		fSendTime = tcp_now();
		fRoundTripStartSequence = segment.sequence;
	}

	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
		// The acknowledgement went out with this segment, so a pending
		// delayed acknowledge is no longer needed.
		fLastAcknowledgeSent = segment.acknowledge;
		gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
	}

	return B_OK;
}
2190
2191
2192 inline bool
_ShouldSendSegment(tcp_segment_header & segment,uint32 length,uint32 segmentMaxSize,uint32 flightSize)2193 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
2194 uint32 segmentMaxSize, uint32 flightSize)
2195 {
2196 if (fState == ESTABLISHED && fSendMaxSegments == 0)
2197 return false;
2198
2199 if (length > 0) {
2200 // Avoid the silly window syndrome - we only send a segment in case:
2201 // - we have a full segment to send, or
2202 // - we're at the end of our buffer queue, or
2203 // - the buffer is at least larger than half of the maximum send window,
2204 // or
2205 // - we're retransmitting data
2206 if (length == segmentMaxSize
2207 || (fOptions & TCP_NODELAY) != 0
2208 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
2209 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
2210 return true;
2211 }
2212
2213 // check if we need to send a window update to the peer
2214 if (segment.advertised_window > 0) {
2215 // correct the window to take into account what already has been advertised
2216 uint32 window = segment.AdvertisedWindow(fReceiveWindowShift)
2217 - (fReceiveMaxAdvertised - fReceiveNext).Number();
2218 if (fReceiveNext == (fReceiveMaxAdvertised + 1))
2219 window = 0;
2220
2221 // if we can advertise a window larger than twice the maximum segment
2222 // size, or half the maximum buffer size we send a window update
2223 if (window >= (fReceiveMaxSegmentSize << 1)
2224 || window >= (socket->receive.buffer_size >> 1))
2225 return true;
2226 }
2227
2228 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
2229 | TCP_FLAG_RESET)) != 0)
2230 return true;
2231
2232 // We do have urgent data pending
2233 if (fSendUrgentOffset > fSendNext)
2234 return true;
2235
2236 // there is no reason to send a segment just now
2237 return false;
2238 }
2239
2240
2241 status_t
_SendAcknowledge(bool force)2242 TCPEndpoint::_SendAcknowledge(bool force)
2243 {
2244 if (fRoute == NULL || fState == LISTEN)
2245 return B_ERROR;
2246
2247 tcp_segment_header segment = _PrepareSendSegment();
2248
2249 // Is there actually anything to do?
2250 if (!force && fState == ESTABLISHED
2251 && fLastAcknowledgeSent == fReceiveNext
2252 && fReceiveQueue.IsContiguous()
2253 && !_ShouldSendSegment(segment, 0, 0, 0))
2254 return B_OK;
2255
2256 net_buffer* buffer = gBufferModule->create(256);
2257 if (buffer == NULL)
2258 return B_NO_MEMORY;
2259
2260 return _PrepareAndSend(segment, buffer, false);
2261 }
2262
2263
2264 /*! Sends one or more TCP segments with the data waiting in the queue. */
2265 status_t
_SendQueued(bool force)2266 TCPEndpoint::_SendQueued(bool force)
2267 {
2268 if (fRoute == NULL || fState < ESTABLISHED)
2269 return B_ERROR;
2270
2271 tcp_segment_header segment = _PrepareSendSegment();
2272
2273 uint32 sendWindow = fSendWindow;
2274 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
2275 sendWindow = fCongestionWindow;
2276
2277 // fSendUnacknowledged
2278 // | fSendNext fSendMax
2279 // | | |
2280 // v v v
2281 // -----------------------------------
2282 // | effective window |
2283 // -----------------------------------
2284
2285 // Flight size represents the window of data which is currently in the
2286 // ether. We should never send data such as the flight size becomes larger
2287 // than the effective window. Note however that the effective window may be
2288 // reduced (by congestion for instance), so at some point in time flight
2289 // size may be larger than the currently calculated window.
2290
2291 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
2292 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();
2293
2294 if (consumedWindow > sendWindow) {
2295 sendWindow = 0;
2296 // TODO: enter persist state? try to get a window update.
2297 } else
2298 sendWindow -= consumedWindow;
2299
2300 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
2301 if (length == 0 && !state_needs_finish(fState)) {
2302 // Nothing to send.
2303 return B_OK;
2304 }
2305
2306 bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
2307 bool retransmit = fSendNext < fSendMax;
2308
2309 if (fDuplicateAcknowledgeCount != 0) {
2310 // send at most 1 SMSS of data when under limited transmit, fast transmit/recovery
2311 length = min_c(length, fSendMaxSegmentSize);
2312 }
2313
2314 do {
2315 uint32 segmentMaxSize = fSendMaxSegmentSize
2316 - tcp_options_length(segment);
2317 uint32 segmentLength = min_c(length, segmentMaxSize);
2318
2319 if ((fSendNext + segmentLength) == fSendQueue.LastSequence() && !force) {
2320 if (state_needs_finish(fState))
2321 segment.flags |= TCP_FLAG_FINISH;
2322 if (length > 0)
2323 segment.flags |= TCP_FLAG_PUSH;
2324 }
2325
2326 // Determine if we should really send this segment
2327 if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength,
2328 segmentMaxSize, flightSize)) {
2329 if (fSendQueue.Available()
2330 && !gStackModule->is_timer_active(&fPersistTimer)
2331 && !gStackModule->is_timer_active(&fRetransmitTimer))
2332 _StartPersistTimer();
2333 break;
2334 }
2335
2336 net_buffer *buffer = gBufferModule->create(256);
2337 if (buffer == NULL)
2338 return B_NO_MEMORY;
2339
2340 status_t status = B_OK;
2341 if (segmentLength > 0)
2342 status = fSendQueue.Get(buffer, fSendNext, segmentLength);
2343 if (status < B_OK) {
2344 gBufferModule->free(buffer);
2345 return status;
2346 }
2347
2348 sendWindow -= buffer->size;
2349
2350 status = _PrepareAndSend(segment, buffer, retransmit);
2351 if (status != B_OK)
2352 return status;
2353
2354 if (shouldStartRetransmitTimer) {
2355 TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME,
2356 fRetransmitTimeout);
2357 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2358 T(TimerSet(this, "retransmit", fRetransmitTimeout));
2359 shouldStartRetransmitTimer = false;
2360 }
2361
2362 length -= segmentLength;
2363 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
2364 | TCP_FLAG_FINISH);
2365
2366 if (retransmit)
2367 break;
2368
2369 } while (length > 0);
2370
2371 return B_OK;
2372 }
2373
2374
2375 int
_MaxSegmentSize(const sockaddr * address) const2376 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2377 {
2378 return next->module->get_mtu(next, address) - sizeof(tcp_header);
2379 }
2380
2381
2382 status_t
_PrepareSendPath(const sockaddr * peer)2383 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2384 {
2385 if (fRoute == NULL) {
2386 fRoute = gDatalinkModule->get_route(Domain(), peer);
2387 if (fRoute == NULL)
2388 return ENETUNREACH;
2389
2390 if ((fRoute->flags & RTF_LOCAL) != 0)
2391 fFlags |= FLAG_LOCAL;
2392 }
2393
2394 // make sure connection does not already exist
2395 status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2396 fRoute->interface_address->local);
2397 if (status < B_OK)
2398 return status;
2399
2400 fInitialSendSequence = system_time() >> 4;
2401 fSendNext = fInitialSendSequence;
2402 fSendUnacknowledged = fInitialSendSequence;
2403 fSendMax = fInitialSendSequence;
2404 fSendUrgentOffset = fInitialSendSequence;
2405 fRecover = fInitialSendSequence.Number();
2406
2407 // we are counting the SYN here
2408 fSendQueue.SetInitialSequence(fSendNext + 1);
2409
2410 fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2411
2412 // Compute the window shift we advertise to our peer - if it doesn't support
2413 // this option, this will be reset to 0 (when its SYN is received)
2414 fReceiveWindowShift = 0;
2415 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2416 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2417 fReceiveWindowShift++;
2418 }
2419
2420 // Increase to a default of 8 (window minimum 256 bytes, maximum 15 MB.)
2421 if (fReceiveWindowShift < 8 && !IsLocal())
2422 fReceiveWindowShift = 8;
2423
2424 return B_OK;
2425 }
2426
2427
2428 void
_Acknowledged(tcp_segment_header & segment)2429 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2430 {
2431 TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %"
2432 B_PRIu32 "; max %" B_PRIu32, segment.acknowledge,
2433 fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number());
2434
2435 ASSERT(fSendUnacknowledged <= segment.acknowledge);
2436
2437 if (fSendUnacknowledged < segment.acknowledge) {
2438 fSendQueue.RemoveUntil(segment.acknowledge);
2439
2440 uint32 bytesAcknowledged = segment.acknowledge - fSendUnacknowledged.Number();
2441 fPreviousHighestAcknowledge = fSendUnacknowledged;
2442 fSendUnacknowledged = segment.acknowledge;
2443 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
2444 int32 expectedSamples = flightSize / (fSendMaxSegmentSize << 1);
2445
2446 if (fPreviousHighestAcknowledge > fSendUnacknowledged) {
2447 // need to update the recover variable upon a sequence wraparound
2448 fRecover = segment.acknowledge - 1;
2449 }
2450
2451 // the acknowledgment of the SYN/ACK MUST NOT increase the size of the congestion window
2452 if (fSendUnacknowledged != fInitialSendSequence) {
2453 if (fCongestionWindow < fSlowStartThreshold)
2454 fCongestionWindow += min_c(bytesAcknowledged, fSendMaxSegmentSize);
2455 else {
2456 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2457
2458 if (increment < fCongestionWindow)
2459 increment = 1;
2460 else
2461 increment /= fCongestionWindow;
2462
2463 fCongestionWindow += increment;
2464 }
2465
2466 fSendMaxSegments = UINT32_MAX;
2467 }
2468
2469 if ((fFlags & FLAG_RECOVERY) != 0) {
2470 fSendNext = fSendUnacknowledged;
2471 _SendQueued();
2472 fCongestionWindow -= bytesAcknowledged;
2473
2474 if (bytesAcknowledged > fSendMaxSegmentSize)
2475 fCongestionWindow += fSendMaxSegmentSize;
2476
2477 fSendNext = fSendMax;
2478 } else
2479 fDuplicateAcknowledgeCount = 0;
2480
2481 if (fSendNext < fSendUnacknowledged)
2482 fSendNext = fSendUnacknowledged;
2483
2484 if (fFlags & FLAG_OPTION_TIMESTAMP) {
2485 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply),
2486 expectedSamples > 0 ? expectedSamples : 1);
2487 } else if (fSendTime != 0 && fRoundTripStartSequence < segment.acknowledge) {
2488 _UpdateRoundTripTime(tcp_diff_timestamp(fSendTime), 1);
2489 fSendTime = 0;
2490 }
2491
2492 if (fSendUnacknowledged == fSendMax) {
2493 TRACE("all acknowledged, cancelling retransmission timer.");
2494 gStackModule->cancel_timer(&fRetransmitTimer);
2495 T(TimerSet(this, "retransmit", -1));
2496 } else {
2497 TRACE("data acknowledged, resetting retransmission timer to: %"
2498 B_PRIdBIGTIME, fRetransmitTimeout);
2499 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2500 T(TimerSet(this, "retransmit", fRetransmitTimeout));
2501 }
2502
2503 if (is_writable(fState)) {
2504 // notify threads waiting on the socket to become writable again
2505 fSendCondition.NotifyAll();
2506 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
2507 }
2508 }
2509
2510 // if there is data left to be sent, send it now
2511 if (fSendQueue.Used() > 0)
2512 _SendQueued();
2513 }
2514
2515
2516 void
_Retransmit()2517 TCPEndpoint::_Retransmit()
2518 {
2519 TRACE("Retransmit()");
2520
2521 if (fState < ESTABLISHED) {
2522 fRetransmitTimeout = TCP_SYN_RETRANSMIT_TIMEOUT;
2523 fCongestionWindow = fSendMaxSegmentSize;
2524 } else {
2525 _ResetSlowStart();
2526 fDuplicateAcknowledgeCount = 0;
2527 // Do exponential back off of the retransmit timeout
2528 fRetransmitTimeout *= 2;
2529 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2530 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2531 }
2532
2533 fSendNext = fSendUnacknowledged;
2534 _SendQueued();
2535
2536 fRecover = fSendNext.Number() - 1;
2537 if ((fFlags & FLAG_RECOVERY) != 0)
2538 fFlags &= ~FLAG_RECOVERY;
2539 }
2540
2541
2542 void
_UpdateRoundTripTime(int32 roundTripTime,int32 expectedSamples)2543 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime, int32 expectedSamples)
2544 {
2545 if (roundTripTime < 0)
2546 return;
2547
2548 if (fSmoothedRoundTripTime <= 0) {
2549 fSmoothedRoundTripTime = roundTripTime;
2550 fRoundTripVariation = roundTripTime / 2;
2551 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2552 * kTimestampFactor;
2553 } else {
2554 int32 delta = fSmoothedRoundTripTime - roundTripTime;
2555 if (delta < 0)
2556 delta = -delta;
2557 fRoundTripVariation += (delta - fRoundTripVariation) / (expectedSamples * 4);
2558 fSmoothedRoundTripTime += (roundTripTime - fSmoothedRoundTripTime) / (expectedSamples * 8);
2559 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4))
2560 * kTimestampFactor;
2561 }
2562
2563 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2564 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2565
2566 if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT)
2567 fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT;
2568
2569 TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
2570 fRetransmitTimeout, roundTripTime);
2571 }
2572
2573
2574 void
_ResetSlowStart()2575 TCPEndpoint::_ResetSlowStart()
2576 {
2577 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2578 2 * fSendMaxSegmentSize);
2579 fCongestionWindow = fSendMaxSegmentSize;
2580 }
2581
2582
2583 // #pragma mark - timer
2584
2585
2586 /*static*/ void
_RetransmitTimer(net_timer * timer,void * _endpoint)2587 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2588 {
2589 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2590 T(TimerTriggered(endpoint, "retransmit"));
2591
2592 MutexLocker locker(endpoint->fLock);
2593 if (!locker.IsLocked() || gStackModule->is_timer_active(timer))
2594 return;
2595
2596 endpoint->_Retransmit();
2597 }
2598
2599
2600 /*static*/ void
_PersistTimer(net_timer * timer,void * _endpoint)2601 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2602 {
2603 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2604 T(TimerTriggered(endpoint, "persist"));
2605
2606 MutexLocker locker(endpoint->fLock);
2607 if (!locker.IsLocked())
2608 return;
2609
2610 // the timer might not have been canceled early enough
2611 if (endpoint->State() == CLOSED)
2612 return;
2613
2614 endpoint->_SendQueued(true);
2615 }
2616
2617
2618 /*static*/ void
_DelayedAcknowledgeTimer(net_timer * timer,void * _endpoint)2619 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2620 {
2621 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2622 T(TimerTriggered(endpoint, "delayed ack"));
2623
2624 MutexLocker locker(endpoint->fLock);
2625 if (!locker.IsLocked())
2626 return;
2627
2628 // the timer might not have been canceled early enough
2629 if (endpoint->State() == CLOSED)
2630 return;
2631
2632 endpoint->_SendAcknowledge();
2633 }
2634
2635
2636 /*static*/ void
_TimeWaitTimer(net_timer * timer,void * _endpoint)2637 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2638 {
2639 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2640 T(TimerTriggered(endpoint, "time-wait"));
2641
2642 MutexLocker locker(endpoint->fLock);
2643 if (!locker.IsLocked())
2644 return;
2645
2646 if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2647 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2648 return;
2649 }
2650
2651 locker.Unlock();
2652
2653 gSocketModule->release_socket(endpoint->socket);
2654 }
2655
2656
2657 /*static*/ status_t
_WaitForCondition(ConditionVariable & condition,MutexLocker & locker,bigtime_t timeout)2658 TCPEndpoint::_WaitForCondition(ConditionVariable& condition,
2659 MutexLocker& locker, bigtime_t timeout)
2660 {
2661 ConditionVariableEntry entry;
2662 condition.Add(&entry);
2663
2664 locker.Unlock();
2665 status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout);
2666 locker.Lock();
2667
2668 return result;
2669 }
2670
2671
2672 // #pragma mark -
2673
2674
2675 void
Dump() const2676 TCPEndpoint::Dump() const
2677 {
2678 kprintf("TCP endpoint %p\n", this);
2679 kprintf(" socket: %p\n", socket);
2680 kprintf(" state: %s\n", name_for_state(fState));
2681 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags);
2682 #if KDEBUG
2683 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2684 #endif
2685 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2686 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2687 kprintf(" send\n");
2688 kprintf(" window shift: %" B_PRIu8 "\n", fSendWindowShift);
2689 kprintf(" unacknowledged: %" B_PRIu32 "\n",
2690 fSendUnacknowledged.Number());
2691 kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number());
2692 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number());
2693 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2694 kprintf(" window: %" B_PRIu32 "\n", fSendWindow);
2695 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow);
2696 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2697 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(),
2698 fSendQueue.Size());
2699 #if DEBUG_TCP_BUFFER_QUEUE
2700 fSendQueue.Dump();
2701 #endif
2702 kprintf(" last acknowledge sent: %" B_PRIu32 "\n",
2703 fLastAcknowledgeSent.Number());
2704 kprintf(" initial sequence: %" B_PRIu32 "\n",
2705 fInitialSendSequence.Number());
2706 kprintf(" receive\n");
2707 kprintf(" window shift: %" B_PRIu8 "\n", fReceiveWindowShift);
2708 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number());
2709 kprintf(" max advertised: %" B_PRIu32 "\n",
2710 fReceiveMaxAdvertised.Number());
2711 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow);
2712 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2713 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
2714 fReceiveQueue.Available(), fReceiveQueue.Size());
2715 #if DEBUG_TCP_BUFFER_QUEUE
2716 fReceiveQueue.Dump();
2717 #endif
2718 kprintf(" initial sequence: %" B_PRIu32 "\n",
2719 fInitialReceiveSequence.Number());
2720 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n",
2721 fDuplicateAcknowledgeCount);
2722 kprintf(" smoothed round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2723 fSmoothedRoundTripTime, fRoundTripVariation);
2724 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2725 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2726 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
2727 }
2728
2729