1 /* 2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 14 #include <netinet/in.h> 15 #include <netinet/ip.h> 16 #include <netinet/tcp.h> 17 #include <new> 18 #include <signal.h> 19 #include <stdlib.h> 20 #include <string.h> 21 22 #include <KernelExport.h> 23 #include <Select.h> 24 25 #include <net_buffer.h> 26 #include <net_datalink.h> 27 #include <net_stat.h> 28 #include <NetBufferUtilities.h> 29 #include <NetUtilities.h> 30 31 #include <lock.h> 32 #include <tracing.h> 33 #include <util/AutoLock.h> 34 #include <util/list.h> 35 36 #include "EndpointManager.h" 37 38 39 // References: 40 // - RFC 793 - Transmission Control Protocol 41 // - RFC 813 - Window and Acknowledgement Strategy in TCP 42 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP 43 // 44 // Things this implementation currently doesn't implement: 45 // - TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, 46 // RFC 2001, RFC 2581, RFC 3042 47 // - NewReno Modification to TCP's Fast Recovery, RFC 2582 48 // - Explicit Congestion Notification (ECN), RFC 3168 49 // - SYN-Cache 50 // - TCP Extensions for High Performance, RFC 1323 51 // - SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 52 // - Forward RTO-Recovery, RFC 4138 53 // - Time-Wait hash instead of keeping sockets alive 54 55 #define PrintAddress(address) \ 56 AddressString(Domain(), address, true).Data() 57 58 //#define TRACE_TCP 59 //#define PROBE_TCP 60 61 #ifdef TRACE_TCP 62 // the space before ', ##args' is important in order for this to work with cpp 2.95 63 # define TRACE(format, args...) dprintf("%" B_PRId32 ": TCP [%" \ 64 B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \ 65 system_time(), this, name_for_state(fState) , ##args) 66 #else 67 # define TRACE(args...) do { } while (0) 68 #endif 69 70 #ifdef PROBE_TCP 71 # define PROBE(buffer, window) \ 72 dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \ 73 " suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \ 74 B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \ 75 B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \ 76 system_time(), PrintAddress(buffer->source), \ 77 PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \ 78 fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \ 79 window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \ 80 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 81 #else 82 # define PROBE(buffer, window) do { } while (0) 83 #endif 84 85 #if TCP_TRACING 86 namespace TCPTracing { 87 88 class Receive : public AbstractTraceEntry { 89 public: 90 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window, 91 net_buffer* buffer) 92 : 93 fEndpoint(endpoint), 94 fBuffer(buffer), 95 fBufferSize(buffer->size), 96 fSequence(segment.sequence), 97 fAcknowledge(segment.acknowledge), 98 fWindow(window), 99 fState(endpoint->State()), 100 fFlags(segment.flags) 101 { 102 Initialized(); 103 } 104 105 virtual void AddDump(TraceOutput& out) 106 { 107 out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), " 108 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 109 ", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer, 110 fBufferSize, fFlags, fSequence, fAcknowledge, fWindow); 111 } 112 113 protected: 114 TCPEndpoint* fEndpoint; 115 net_buffer* fBuffer; 116 uint32 fBufferSize; 117 uint32 fSequence; 118 uint32 fAcknowledge; 119 uint32 fWindow; 120 tcp_state fState; 121 uint8 fFlags; 122 }; 123 124 class Send : public AbstractTraceEntry { 125 public: 126 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer, 127 tcp_sequence firstSequence, tcp_sequence lastSequence) 128 : 129 fEndpoint(endpoint), 130 fBuffer(buffer), 131 fBufferSize(buffer->size), 132 fSequence(segment.sequence), 133 fAcknowledge(segment.acknowledge), 134 fFirstSequence(firstSequence.Number()), 135 fLastSequence(lastSequence.Number()), 136 fState(endpoint->State()), 137 fFlags(segment.flags) 138 { 139 Initialized(); 140 } 141 142 virtual void AddDump(TraceOutput& out) 143 { 144 out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), " 145 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 146 ", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint, 147 name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence, 148 fAcknowledge, fFirstSequence, fLastSequence); 149 } 150 151 protected: 152 TCPEndpoint* fEndpoint; 153 net_buffer* fBuffer; 154 uint32 fBufferSize; 155 uint32 fSequence; 156 uint32 fAcknowledge; 157 uint32 fFirstSequence; 158 uint32 fLastSequence; 159 tcp_state fState; 160 uint8 fFlags; 161 }; 162 163 class State : public AbstractTraceEntry { 164 public: 165 State(TCPEndpoint* endpoint) 166 : 167 fEndpoint(endpoint), 168 fState(endpoint->State()) 169 { 170 Initialized(); 171 } 172 173 virtual void AddDump(TraceOutput& out) 174 { 175 out.Print("tcp:%p (%12s) state change", fEndpoint, 176 name_for_state(fState)); 177 } 178 179 protected: 180 TCPEndpoint* fEndpoint; 181 tcp_state fState; 182 }; 183 184 class Spawn : public AbstractTraceEntry { 185 public: 186 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint) 187 : 188 fListeningEndpoint(listeningEndpoint), 189 fSpawnedEndpoint(spawnedEndpoint) 190 { 191 Initialized(); 192 } 193 194 virtual void AddDump(TraceOutput& out) 195 { 196 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint); 197 } 198 199 protected: 200 TCPEndpoint* fListeningEndpoint; 201 TCPEndpoint* fSpawnedEndpoint; 202 }; 203 204 class Error : public AbstractTraceEntry { 205 public: 206 Error(TCPEndpoint* endpoint, const char* error, int32 line) 207 : 208 fEndpoint(endpoint), 209 fLine(line), 210 fError(error), 211 fState(endpoint->State()) 212 { 213 Initialized(); 214 } 215 216 virtual void AddDump(TraceOutput& out) 217 { 218 out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint, 219 name_for_state(fState), fLine, fError); 220 } 221 222 protected: 223 TCPEndpoint* fEndpoint; 224 int32 fLine; 225 const char* fError; 226 tcp_state fState; 227 }; 228 229 class TimerSet : public AbstractTraceEntry { 230 public: 231 TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout) 232 : 233 fEndpoint(endpoint), 234 fWhich(which), 235 fTimeout(timeout), 236 fState(endpoint->State()) 237 { 238 Initialized(); 239 } 240 241 virtual void AddDump(TraceOutput& out) 242 { 243 out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint, 244 name_for_state(fState), fWhich, fTimeout); 245 } 246 247 protected: 248 TCPEndpoint* fEndpoint; 249 const char* fWhich; 250 bigtime_t fTimeout; 251 tcp_state fState; 252 }; 253 254 class TimerTriggered : public AbstractTraceEntry { 255 public: 256 TimerTriggered(TCPEndpoint* endpoint, const char* which) 257 : 258 fEndpoint(endpoint), 259 fWhich(which), 260 fState(endpoint->State()) 261 { 262 Initialized(); 263 } 264 265 virtual void AddDump(TraceOutput& out) 266 { 267 out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint, 268 name_for_state(fState), fWhich); 269 } 270 271 protected: 272 TCPEndpoint* fEndpoint; 273 const char* fWhich; 274 tcp_state fState; 275 }; 276 277 class APICall : public AbstractTraceEntry { 278 public: 279 APICall(TCPEndpoint* endpoint, const char* which) 280 : 281 fEndpoint(endpoint), 282 fWhich(which), 283 fState(endpoint->State()) 284 { 285 Initialized(); 286 } 287 288 virtual void AddDump(TraceOutput& out) 289 { 290 out.Print("tcp:%p (%12s) api call: %s", fEndpoint, 291 name_for_state(fState), fWhich); 292 } 293 294 protected: 295 TCPEndpoint* fEndpoint; 296 const char* fWhich; 297 tcp_state fState; 298 }; 299 300 } // namespace TCPTracing 301 302 # define T(x) new(std::nothrow) TCPTracing::x 303 #else 304 # define T(x) 305 #endif // TCP_TRACING 306 307 308 // constants for the fFlags field 309 enum { 310 FLAG_OPTION_WINDOW_SCALE = 0x01, 311 FLAG_OPTION_TIMESTAMP = 0x02, 312 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 313 // That is, what is expected from accept() after a shutdown() 314 // is performed on a listen()ing socket. 315 FLAG_NO_RECEIVE = 0x04, 316 FLAG_CLOSED = 0x08, 317 FLAG_DELETE_ON_CLOSE = 0x10, 318 FLAG_LOCAL = 0x20 319 }; 320 321 322 static const int kTimestampFactor = 1000; 323 // conversion factor between usec system time and msec tcp time 324 325 326 static inline bigtime_t 327 absolute_timeout(bigtime_t timeout) 328 { 329 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 330 return timeout; 331 332 return timeout + system_time(); 333 } 334 335 336 static inline status_t 337 posix_error(status_t error) 338 { 339 if (error == B_TIMED_OUT) 340 return B_WOULD_BLOCK; 341 342 return error; 343 } 344 345 346 static inline bool 347 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext, 348 uint32 receiveWindow) 349 { 350 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow); 351 } 352 353 354 static inline bool 355 segment_in_sequence(const tcp_segment_header& segment, int size, 356 const tcp_sequence& receiveNext, uint32 receiveWindow) 357 { 358 tcp_sequence sequence(segment.sequence); 359 if (size == 0) { 360 if (receiveWindow == 0) 361 return sequence == receiveNext; 362 return in_window(sequence, receiveNext, receiveWindow); 363 } else { 364 if (receiveWindow == 0) 365 return false; 366 return in_window(sequence, receiveNext, receiveWindow) 367 || in_window(sequence + size - 1, receiveNext, receiveWindow); 368 } 369 } 370 371 372 static inline bool 373 is_writable(tcp_state state) 374 { 375 return state == ESTABLISHED || state == FINISH_RECEIVED; 376 } 377 378 379 static inline bool 380 is_establishing(tcp_state state) 381 { 382 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED; 383 } 384 385 386 static inline uint32 tcp_now() 387 { 388 return system_time() / kTimestampFactor; 389 } 390 391 392 static inline uint32 tcp_diff_timestamp(uint32 base) 393 { 394 uint32 now = tcp_now(); 395 396 if (now > base) 397 return now - base; 398 399 return now + UINT_MAX - base; 400 } 401 402 403 static inline bool 404 state_needs_finish(int32 state) 405 { 406 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 407 || state == FINISH_SENT || state == CLOSING; 408 } 409 410 411 // #pragma mark - 412 413 414 TCPEndpoint::TCPEndpoint(net_socket* socket) 415 : 416 ProtocolSocket(socket), 417 fManager(NULL), 418 fOptions(0), 419 fSendWindowShift(0), 420 fReceiveWindowShift(0), 421 fSendUnacknowledged(0), 422 fSendNext(0), 423 fSendMax(0), 424 fSendUrgentOffset(0), 425 fSendWindow(0), 426 fSendMaxWindow(0), 427 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 428 fSendQueue(socket->send.buffer_size), 429 fInitialSendSequence(0), 430 fDuplicateAcknowledgeCount(0), 431 fRoute(NULL), 432 fReceiveNext(0), 433 fReceiveMaxAdvertised(0), 434 fReceiveWindow(socket->receive.buffer_size), 435 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 436 fReceiveQueue(socket->receive.buffer_size), 437 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 438 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 439 fRetransmitTimeout(TCP_INITIAL_RTT), 440 fReceivedTimestamp(0), 441 fCongestionWindow(0), 442 fSlowStartThreshold(0), 443 fState(CLOSED), 444 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP) 445 { 446 // TODO: to be replaced with a real read/write locking strategy! 447 mutex_init(&fLock, "tcp lock"); 448 449 fReceiveCondition.Init(this, "tcp receive"); 450 fSendCondition.Init(this, "tcp send"); 451 452 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 453 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, 454 this); 455 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 456 TCPEndpoint::_DelayedAcknowledgeTimer, this); 457 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, 458 this); 459 460 T(APICall(this, "constructor")); 461 } 462 463 464 TCPEndpoint::~TCPEndpoint() 465 { 466 mutex_lock(&fLock); 467 468 T(APICall(this, "destructor")); 469 470 _CancelConnectionTimers(); 471 gStackModule->cancel_timer(&fTimeWaitTimer); 472 T(TimerSet(this, "time-wait", -1)); 473 474 if (fManager != NULL) { 475 fManager->Unbind(this); 476 put_endpoint_manager(fManager); 477 } 478 479 mutex_destroy(&fLock); 480 481 // we need to wait for all timers to return 482 gStackModule->wait_for_timer(&fRetransmitTimer); 483 gStackModule->wait_for_timer(&fPersistTimer); 484 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer); 485 gStackModule->wait_for_timer(&fTimeWaitTimer); 486 487 gDatalinkModule->put_route(Domain(), fRoute); 488 } 489 490 491 status_t 492 TCPEndpoint::InitCheck() const 493 { 494 return B_OK; 495 } 496 497 498 // #pragma mark - protocol API 499 500 501 status_t 502 TCPEndpoint::Open() 503 { 504 TRACE("Open()"); 505 T(APICall(this, "open")); 506 507 status_t status = ProtocolSocket::Open(); 508 if (status < B_OK) 509 return status; 510 511 fManager = get_endpoint_manager(Domain()); 512 if (fManager == NULL) 513 return EAFNOSUPPORT; 514 515 return B_OK; 516 } 517 518 519 status_t 520 TCPEndpoint::Close() 521 { 522 MutexLocker locker(fLock); 523 524 TRACE("Close()"); 525 T(APICall(this, "close")); 526 527 if (fState == LISTEN) 528 delete_sem(fAcceptSemaphore); 529 530 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 531 // TODO: what about linger in case of SYNCHRONIZE_SENT? 532 fState = CLOSED; 533 T(State(this)); 534 return B_OK; 535 } 536 537 status_t status = _Disconnect(true); 538 if (status != B_OK) 539 return status; 540 541 if (socket->options & SO_LINGER) { 542 TRACE("Close(): Lingering for %i secs", socket->linger); 543 544 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 545 546 while (fSendQueue.Used() > 0) { 547 status = _WaitForCondition(fSendCondition, locker, maximum); 548 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 549 break; 550 else if (status < B_OK) 551 return status; 552 } 553 554 TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE 555 " bytes.", fSendQueue.Used()); 556 } 557 return B_OK; 558 } 559 560 561 void 562 TCPEndpoint::Free() 563 { 564 MutexLocker _(fLock); 565 566 TRACE("Free()"); 567 T(APICall(this, "free")); 568 569 if (fState <= SYNCHRONIZE_SENT) 570 return; 571 572 // we are only interested in the timer, not in changing state 573 _EnterTimeWait(); 574 575 fFlags |= FLAG_CLOSED; 576 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) { 577 // we'll be freed later when the 2MSL timer expires 578 gSocketModule->acquire_socket(socket); 579 } 580 } 581 582 583 /*! Creates and sends a synchronize packet to /a address, and then waits 584 until the connection has been established or refused. 585 */ 586 status_t 587 TCPEndpoint::Connect(const sockaddr* address) 588 { 589 if (!AddressModule()->is_same_family(address)) 590 return EAFNOSUPPORT; 591 592 MutexLocker locker(fLock); 593 594 TRACE("Connect() on address %s", PrintAddress(address)); 595 T(APICall(this, "connect")); 596 597 if (gStackModule->is_restarted_syscall()) { 598 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout(); 599 status_t status = _WaitForEstablished(locker, timeout); 600 TRACE(" Connect(): Connection complete: %s (timeout was %" 601 B_PRIdBIGTIME ")", strerror(status), timeout); 602 return posix_error(status); 603 } 604 605 // Can only call connect() from CLOSED or LISTEN states 606 // otherwise endpoint is considered already connected 607 if (fState == LISTEN) { 608 // this socket is about to connect; remove pending connections in the backlog 609 gSocketModule->set_max_backlog(socket, 0); 610 } else if (fState == ESTABLISHED) { 611 return EISCONN; 612 } else if (fState != CLOSED) 613 return EINPROGRESS; 614 615 // consider destination address INADDR_ANY as INADDR_LOOPBACK 616 sockaddr_storage _address; 617 if (AddressModule()->is_empty_address(address, false)) { 618 AddressModule()->get_loopback_address((sockaddr *)&_address); 619 // for IPv4 and IPv6 the port is at the same offset 620 ((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port; 621 address = (sockaddr *)&_address; 622 } 623 624 status_t status = _PrepareSendPath(address); 625 if (status < B_OK) 626 return status; 627 628 TRACE(" Connect(): starting 3-way handshake..."); 629 630 fState = SYNCHRONIZE_SENT; 631 T(State(this)); 632 633 // send SYN 634 status = _SendQueued(); 635 if (status != B_OK) { 636 _Close(); 637 return status; 638 } 639 640 // If we are running over Loopback, after _SendQueued() returns we 641 // may be in ESTABLISHED already. 642 if (fState == ESTABLISHED) { 643 TRACE(" Connect() completed after _SendQueued()"); 644 return B_OK; 645 } 646 647 // wait until 3-way handshake is complete (if needed) 648 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 649 if (timeout == 0) { 650 // we're a non-blocking socket 651 TRACE(" Connect() delayed, return EINPROGRESS"); 652 return EINPROGRESS; 653 } 654 655 bigtime_t absoluteTimeout = absolute_timeout(timeout); 656 gStackModule->store_syscall_restart_timeout(absoluteTimeout); 657 658 status = _WaitForEstablished(locker, absoluteTimeout); 659 TRACE(" Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME 660 ")", strerror(status), timeout); 661 return posix_error(status); 662 } 663 664 665 status_t 666 TCPEndpoint::Accept(struct net_socket** _acceptedSocket) 667 { 668 MutexLocker locker(fLock); 669 670 TRACE("Accept()"); 671 T(APICall(this, "accept")); 672 673 status_t status; 674 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 675 if (gStackModule->is_restarted_syscall()) 676 timeout = gStackModule->restore_syscall_restart_timeout(); 677 else 678 gStackModule->store_syscall_restart_timeout(timeout); 679 680 do { 681 locker.Unlock(); 682 683 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 684 | B_CAN_INTERRUPT, timeout); 685 if (status != B_OK) { 686 if (status == B_TIMED_OUT && socket->receive.timeout == 0) 687 return B_WOULD_BLOCK; 688 689 return status; 690 } 691 692 locker.Lock(); 693 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 694 #ifdef TRACE_TCP 695 if (status == B_OK) 696 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 697 #endif 698 } while (status != B_OK); 699 700 return status; 701 } 702 703 704 status_t 705 TCPEndpoint::Bind(const sockaddr *address) 706 { 707 if (address == NULL) 708 return B_BAD_VALUE; 709 710 MutexLocker lock(fLock); 711 712 TRACE("Bind() on address %s", PrintAddress(address)); 713 T(APICall(this, "bind")); 714 715 if (fState != CLOSED) 716 return EISCONN; 717 718 return fManager->Bind(this, address); 719 } 720 721 722 status_t 723 TCPEndpoint::Unbind(struct sockaddr *address) 724 { 725 MutexLocker _(fLock); 726 727 TRACE("Unbind()"); 728 T(APICall(this, "unbind")); 729 730 return fManager->Unbind(this); 731 } 732 733 734 status_t 735 TCPEndpoint::Listen(int count) 736 { 737 MutexLocker _(fLock); 738 739 TRACE("Listen()"); 740 T(APICall(this, "listen")); 741 742 if (fState != CLOSED && fState != LISTEN) 743 return B_BAD_VALUE; 744 745 if (fState == CLOSED) { 746 fAcceptSemaphore = create_sem(0, "tcp accept"); 747 if (fAcceptSemaphore < B_OK) 748 return ENOBUFS; 749 750 status_t status = fManager->SetPassive(this); 751 if (status != B_OK) { 752 delete_sem(fAcceptSemaphore); 753 fAcceptSemaphore = -1; 754 return status; 755 } 756 } 757 758 gSocketModule->set_max_backlog(socket, count); 759 760 fState = LISTEN; 761 T(State(this)); 762 return B_OK; 763 } 764 765 766 status_t 767 TCPEndpoint::Shutdown(int direction) 768 { 769 MutexLocker lock(fLock); 770 771 TRACE("Shutdown(%i)", direction); 772 T(APICall(this, "shutdown")); 773 774 if (direction == SHUT_RD || direction == SHUT_RDWR) 775 fFlags |= FLAG_NO_RECEIVE; 776 777 if (direction == SHUT_WR || direction == SHUT_RDWR) { 778 // TODO: That's not correct. After read/write shutting down the socket 779 // one should still be able to read previously arrived data. 780 _Disconnect(false); 781 } 782 783 return B_OK; 784 } 785 786 787 /*! Puts data contained in \a buffer into send buffer */ 788 status_t 789 TCPEndpoint::SendData(net_buffer *buffer) 790 { 791 MutexLocker lock(fLock); 792 793 TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32 794 ") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer, 795 buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free()); 796 T(APICall(this, "senddata")); 797 798 uint32 flags = buffer->flags; 799 800 if (fState == CLOSED) 801 return ENOTCONN; 802 if (fState == LISTEN) 803 return EDESTADDRREQ; 804 if (!is_writable(fState) && !is_establishing(fState)) { 805 // we only send signals when called from userland 806 if (gStackModule->is_syscall() && (flags & MSG_NOSIGNAL) == 0) 807 send_signal(find_thread(NULL), SIGPIPE); 808 return EPIPE; 809 } 810 811 size_t left = buffer->size; 812 813 bigtime_t timeout = absolute_timeout(socket->send.timeout); 814 if (gStackModule->is_restarted_syscall()) 815 timeout = gStackModule->restore_syscall_restart_timeout(); 816 else 817 gStackModule->store_syscall_restart_timeout(timeout); 818 819 while (left > 0) { 820 while (fSendQueue.Free() < socket->send.low_water_mark) { 821 // wait until enough space is available 822 status_t status = _WaitForCondition(fSendCondition, lock, timeout); 823 if (status < B_OK) { 824 TRACE(" SendData() returning %s (%d)", 825 strerror(posix_error(status)), (int)posix_error(status)); 826 return posix_error(status); 827 } 828 829 if (!is_writable(fState) && !is_establishing(fState)) { 830 // we only send signals when called from userland 831 if (gStackModule->is_syscall()) 832 send_signal(find_thread(NULL), SIGPIPE); 833 return EPIPE; 834 } 835 } 836 837 size_t size = fSendQueue.Free(); 838 if (size < left) { 839 // we need to split the original buffer 840 net_buffer* clone = gBufferModule->clone(buffer, false); 841 // TODO: add offset/size parameter to net_buffer::clone() or 842 // even a move_data() function, as this is a bit inefficient 843 if (clone == NULL) 844 return ENOBUFS; 845 846 status_t status = gBufferModule->trim(clone, size); 847 if (status != B_OK) { 848 gBufferModule->free(clone); 849 return status; 850 } 851 852 gBufferModule->remove_header(buffer, size); 853 left -= size; 854 fSendQueue.Add(clone); 855 } else { 856 left -= buffer->size; 857 fSendQueue.Add(buffer); 858 } 859 } 860 861 TRACE(" SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used()); 862 863 bool force = false; 864 if ((flags & MSG_OOB) != 0) { 865 fSendUrgentOffset = fSendQueue.LastSequence(); 866 // RFC 961 specifies that the urgent offset points to the last 867 // byte of urgent data. However, this is commonly implemented as 868 // here, ie. it points to the first byte after the urgent data. 869 force = true; 870 } 871 if ((flags & MSG_EOF) != 0) 872 _Disconnect(false); 873 874 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 875 _SendQueued(force); 876 877 return B_OK; 878 } 879 880 881 ssize_t 882 TCPEndpoint::SendAvailable() 883 { 884 MutexLocker locker(fLock); 885 886 ssize_t available; 887 888 if (is_writable(fState)) 889 available = fSendQueue.Free(); 890 else if (is_establishing(fState)) 891 available = 0; 892 else 893 available = EPIPE; 894 895 TRACE("SendAvailable(): %" B_PRIdSSIZE, available); 896 T(APICall(this, "sendavailable")); 897 return available; 898 } 899 900 901 status_t 902 TCPEndpoint::FillStat(net_stat *stat) 903 { 904 MutexLocker _(fLock); 905 906 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 907 stat->receive_queue_size = fReceiveQueue.Available(); 908 stat->send_queue_size = fSendQueue.Used(); 909 910 return B_OK; 911 } 912 913 914 status_t 915 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 916 { 917 MutexLocker locker(fLock); 918 919 TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes, 920 flags); 921 T(APICall(this, "readdata")); 922 923 *_buffer = NULL; 924 925 if (fState == CLOSED) 926 return ENOTCONN; 927 928 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 929 if (gStackModule->is_restarted_syscall()) 930 timeout = gStackModule->restore_syscall_restart_timeout(); 931 else 932 gStackModule->store_syscall_restart_timeout(timeout); 933 934 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 935 if (flags & MSG_DONTWAIT) 936 return B_WOULD_BLOCK; 937 938 status_t status = _WaitForEstablished(locker, timeout); 939 if (status < B_OK) 940 return posix_error(status); 941 } 942 943 size_t dataNeeded = socket->receive.low_water_mark; 944 945 // When MSG_WAITALL is set then the function should block 946 // until the full amount of data can be returned. 947 if (flags & MSG_WAITALL) 948 dataNeeded = numBytes; 949 950 // TODO: add support for urgent data (MSG_OOB) 951 952 while (true) { 953 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 954 || fState == TIME_WAIT) { 955 // ``Connection closing''. 956 return B_OK; 957 } 958 959 if (fReceiveQueue.Available() > 0) { 960 if (fReceiveQueue.Available() >= dataNeeded 961 || (fReceiveQueue.PushedData() > 0 962 && fReceiveQueue.PushedData() >= fReceiveQueue.Available())) 963 break; 964 } else if (fState == FINISH_RECEIVED) { 965 // ``If no text is awaiting delivery, the RECEIVE will 966 // get a Connection closing''. 967 return B_OK; 968 } 969 970 if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0) 971 return B_WOULD_BLOCK; 972 973 if ((fFlags & FLAG_NO_RECEIVE) != 0) 974 return B_OK; 975 976 status_t status = _WaitForCondition(fReceiveCondition, locker, timeout); 977 if (status < B_OK) { 978 // The Open Group base specification mentions that EINTR should be 979 // returned if the recv() is interrupted before _any data_ is 980 // available. So we actually check if there is data, and if so, 981 // push it to the user. 982 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 983 && fReceiveQueue.Available() > 0) 984 break; 985 986 return posix_error(status); 987 } 988 } 989 990 TRACE(" ReadData(): %" B_PRIuSIZE " are available.", 991 fReceiveQueue.Available()); 992 993 if (numBytes < fReceiveQueue.Available()) 994 fReceiveCondition.NotifyAll(); 995 996 bool clone = (flags & MSG_PEEK) != 0; 997 998 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 999 1000 TRACE(" ReadData(): %" B_PRIuSIZE " bytes kept.", 1001 fReceiveQueue.Available()); 1002 1003 // if we are opening the window, check if we should send an ACK 1004 if (!clone) 1005 SendAcknowledge(false); 1006 1007 return receivedBytes; 1008 } 1009 1010 1011 ssize_t 1012 TCPEndpoint::ReadAvailable() 1013 { 1014 MutexLocker locker(fLock); 1015 1016 TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData()); 1017 T(APICall(this, "readavailable")); 1018 1019 return _AvailableData(); 1020 } 1021 1022 1023 status_t 1024 TCPEndpoint::SetSendBufferSize(size_t length) 1025 { 1026 MutexLocker _(fLock); 1027 fSendQueue.SetMaxBytes(length); 1028 return B_OK; 1029 } 1030 1031 1032 status_t 1033 TCPEndpoint::SetReceiveBufferSize(size_t length) 1034 { 1035 MutexLocker _(fLock); 1036 fReceiveQueue.SetMaxBytes(length); 1037 return B_OK; 1038 } 1039 1040 1041 status_t 1042 TCPEndpoint::GetOption(int option, void* _value, int* _length) 1043 { 1044 if (*_length != sizeof(int)) 1045 return B_BAD_VALUE; 1046 1047 int* value = (int*)_value; 1048 1049 switch (option) { 1050 case TCP_NODELAY: 1051 if ((fOptions & TCP_NODELAY) != 0) 1052 *value = 1; 1053 else 1054 *value = 0; 1055 return B_OK; 1056 1057 case TCP_MAXSEG: 1058 *value = fReceiveMaxSegmentSize; 1059 return B_OK; 1060 1061 default: 1062 return B_BAD_VALUE; 1063 } 1064 } 1065 1066 1067 status_t 1068 TCPEndpoint::SetOption(int option, const void* _value, int length) 1069 { 1070 if (option != TCP_NODELAY) 1071 return B_BAD_VALUE; 1072 1073 if (length != sizeof(int)) 1074 return B_BAD_VALUE; 1075 1076 const int* value = (const int*)_value; 1077 1078 MutexLocker _(fLock); 1079 if (*value) 1080 fOptions |= TCP_NODELAY; 1081 else 1082 fOptions &= ~TCP_NODELAY; 1083 1084 return B_OK; 1085 } 1086 1087 1088 // #pragma mark - misc 1089 1090 1091 bool 1092 TCPEndpoint::IsBound() const 1093 { 1094 return !LocalAddress().IsEmpty(true); 1095 } 1096 1097 1098 bool 1099 TCPEndpoint::IsLocal() const 1100 { 1101 return (fFlags & FLAG_LOCAL) != 0; 1102 } 1103 1104 1105 status_t 1106 TCPEndpoint::DelayedAcknowledge() 1107 { 1108 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 1109 // timer was active, send an ACK now (with the exception above, 1110 // we send every other ACK) 1111 T(TimerSet(this, "delayed ack", -1)); 1112 return SendAcknowledge(true); 1113 } 1114 1115 gStackModule->set_timer(&fDelayedAcknowledgeTimer, 1116 TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 1117 T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT)); 1118 return B_OK; 1119 } 1120 1121 1122 status_t 1123 TCPEndpoint::SendAcknowledge(bool force) 1124 { 1125 return _SendQueued(force, 0); 1126 } 1127 1128 1129 void 1130 TCPEndpoint::_StartPersistTimer() 1131 { 1132 gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT); 1133 T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT)); 1134 } 1135 1136 1137 void 1138 TCPEndpoint::_EnterTimeWait() 1139 { 1140 TRACE("_EnterTimeWait()"); 1141 1142 if (fState == TIME_WAIT) { 1143 _CancelConnectionTimers(); 1144 1145 if (IsLocal()) { 1146 // we do not use TIME_WAIT state for local connections 1147 fFlags |= FLAG_DELETE_ON_CLOSE; 1148 return; 1149 } 1150 } 1151 1152 _UpdateTimeWait(); 1153 } 1154 1155 1156 void 1157 TCPEndpoint::_UpdateTimeWait() 1158 { 1159 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 1160 T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1)); 1161 } 1162 1163 1164 void 1165 TCPEndpoint::_CancelConnectionTimers() 1166 { 1167 gStackModule->cancel_timer(&fRetransmitTimer); 1168 T(TimerSet(this, "retransmit", -1)); 1169 gStackModule->cancel_timer(&fPersistTimer); 1170 T(TimerSet(this, "persist", -1)); 1171 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 1172 T(TimerSet(this, "delayed ack", -1)); 1173 } 1174 1175 1176 /*! Sends the FIN flag to the peer when the connection is still open. 1177 Moves the endpoint to the next state depending on where it was. 1178 */ 1179 status_t 1180 TCPEndpoint::_Disconnect(bool closing) 1181 { 1182 tcp_state previousState = fState; 1183 1184 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1185 fState = FINISH_SENT; 1186 else if (fState == FINISH_RECEIVED) 1187 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1188 else 1189 return B_OK; 1190 1191 T(State(this)); 1192 1193 status_t status = _SendQueued(); 1194 if (status != B_OK) { 1195 fState = previousState; 1196 T(State(this)); 1197 return status; 1198 } 1199 1200 return B_OK; 1201 } 1202 1203 1204 void 1205 TCPEndpoint::_MarkEstablished() 1206 { 1207 fState = ESTABLISHED; 1208 T(State(this)); 1209 1210 if (gSocketModule->has_parent(socket)) { 1211 gSocketModule->set_connected(socket); 1212 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1213 } 1214 1215 fSendCondition.NotifyAll(); 1216 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 1217 } 1218 1219 1220 status_t 1221 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1222 { 1223 // TODO: Checking for CLOSED seems correct, but breaks several neon tests. 1224 // When investigating this, also have a look at _Close() and _HandleReset(). 1225 while (fState < ESTABLISHED/* && fState != CLOSED*/) { 1226 if (socket->error != B_OK) 1227 return socket->error; 1228 1229 status_t status = _WaitForCondition(fSendCondition, locker, timeout); 1230 if (status < B_OK) 1231 return status; 1232 } 1233 1234 return B_OK; 1235 } 1236 1237 1238 // #pragma mark - receive 1239 1240 1241 void 1242 TCPEndpoint::_Close() 1243 { 1244 _CancelConnectionTimers(); 1245 fState = CLOSED; 1246 T(State(this)); 1247 1248 fFlags |= FLAG_DELETE_ON_CLOSE; 1249 1250 fSendCondition.NotifyAll(); 1251 _NotifyReader(); 1252 1253 if (gSocketModule->has_parent(socket)) { 1254 // We still have a parent - obviously, we haven't been accepted yet, 1255 // so no one could ever close us. 1256 _CancelConnectionTimers(); 1257 gSocketModule->set_aborted(socket); 1258 } 1259 } 1260 1261 1262 void 1263 TCPEndpoint::_HandleReset(status_t error) 1264 { 1265 socket->error = error; 1266 _Close(); 1267 1268 gSocketModule->notify(socket, B_SELECT_WRITE, error); 1269 gSocketModule->notify(socket, B_SELECT_ERROR, error); 1270 } 1271 1272 1273 void 1274 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1275 { 1276 if (++fDuplicateAcknowledgeCount < 3) 1277 return; 1278 1279 if (fDuplicateAcknowledgeCount == 3) { 1280 _ResetSlowStart(); 1281 fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize; 1282 fSendNext = segment.acknowledge; 1283 } else if (fDuplicateAcknowledgeCount > 3) 1284 fCongestionWindow += fSendMaxSegmentSize; 1285 1286 _SendQueued(); 1287 } 1288 1289 1290 void 1291 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment, 1292 size_t segmentLength) 1293 { 1294 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1295 tcp_sequence sequence(segment.sequence); 1296 1297 if (fLastAcknowledgeSent >= sequence 1298 && fLastAcknowledgeSent < (sequence + segmentLength)) 1299 fReceivedTimestamp = segment.timestamp_value; 1300 } 1301 } 1302 1303 1304 ssize_t 1305 TCPEndpoint::_AvailableData() const 1306 { 1307 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1308 // the application of FLAG_NO_RECEIVE in listen()ing 1309 // sockets. 1310 if (fState == LISTEN) 1311 return gSocketModule->count_connected(socket); 1312 if (fState == SYNCHRONIZE_SENT) 1313 return 0; 1314 1315 ssize_t availableData = fReceiveQueue.Available(); 1316 1317 if (availableData == 0 && !_ShouldReceive()) 1318 return ENOTCONN; 1319 1320 return availableData; 1321 } 1322 1323 1324 void 1325 TCPEndpoint::_NotifyReader() 1326 { 1327 fReceiveCondition.NotifyAll(); 1328 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1329 } 1330 1331 1332 bool 1333 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer) 1334 { 1335 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1336 // Remember the position of the finish received flag 1337 fFinishReceived = true; 1338 fFinishReceivedAt = segment.sequence + buffer->size; 1339 } 1340 1341 fReceiveQueue.Add(buffer, segment.sequence); 1342 fReceiveNext = fReceiveQueue.NextSequence(); 1343 1344 if (fFinishReceived) { 1345 // Set or reset the finish flag on the current segment 1346 if (fReceiveNext < fFinishReceivedAt) 1347 segment.flags &= ~TCP_FLAG_FINISH; 1348 else 1349 segment.flags |= TCP_FLAG_FINISH; 1350 } 1351 1352 TRACE(" _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %" 1353 B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available()); 1354 1355 if ((segment.flags & TCP_FLAG_PUSH) != 0) 1356 fReceiveQueue.SetPushPointer(); 1357 1358 return fReceiveQueue.Available() > 0; 1359 } 1360 1361 1362 void 1363 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment) 1364 { 1365 fInitialReceiveSequence = segment.sequence; 1366 fFinishReceived = false; 1367 1368 // count the received SYN 1369 segment.sequence++; 1370 1371 fReceiveNext = segment.sequence; 1372 fReceiveQueue.SetInitialSequence(segment.sequence); 1373 1374 if ((fOptions & TCP_NOOPT) == 0) { 1375 if (segment.max_segment_size > 0) 1376 fSendMaxSegmentSize = segment.max_segment_size; 1377 1378 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1379 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1380 fSendWindowShift = segment.window_shift; 1381 } else { 1382 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1383 fReceiveWindowShift = 0; 1384 } 1385 1386 if (segment.options & TCP_HAS_TIMESTAMPS) { 1387 fFlags |= FLAG_OPTION_TIMESTAMP; 1388 fReceivedTimestamp = segment.timestamp_value; 1389 } else 1390 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1391 } 1392 1393 fCongestionWindow = 2 * fSendMaxSegmentSize; 1394 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1395 } 1396 1397 1398 bool 1399 TCPEndpoint::_ShouldReceive() const 1400 { 1401 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1402 return false; 1403 1404 return fState == ESTABLISHED || fState == FINISH_SENT 1405 || fState == FINISH_ACKNOWLEDGED; 1406 } 1407 1408 1409 int32 1410 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment, 1411 net_buffer* buffer) 1412 { 1413 MutexLocker _(fLock); 1414 1415 // TODO error checking 1416 ProtocolSocket::Open(); 1417 1418 fState = SYNCHRONIZE_RECEIVED; 1419 T(Spawn(parent, this)); 1420 1421 fManager = parent->fManager; 1422 1423 LocalAddress().SetTo(buffer->destination); 1424 PeerAddress().SetTo(buffer->source); 1425 1426 TRACE("Spawn()"); 1427 1428 // TODO: proper error handling! 1429 if (fManager->BindChild(this) != B_OK) { 1430 T(Error(this, "binding failed", __LINE__)); 1431 return DROP; 1432 } 1433 if (_PrepareSendPath(*PeerAddress()) != B_OK) { 1434 T(Error(this, "prepare send faild", __LINE__)); 1435 return DROP; 1436 } 1437 1438 fOptions = parent->fOptions; 1439 fAcceptSemaphore = parent->fAcceptSemaphore; 1440 1441 _PrepareReceivePath(segment); 1442 1443 // send SYN+ACK 1444 if (_SendQueued() != B_OK) { 1445 T(Error(this, "sending failed", __LINE__)); 1446 return DROP; 1447 } 1448 1449 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1450 // we handled this flag now, it must not be set for further processing 1451 1452 return _Receive(segment, buffer); 1453 } 1454 1455 1456 int32 1457 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer) 1458 { 1459 TRACE("ListenReceive()"); 1460 1461 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 1462 // but the error behaviour differs 1463 if (segment.flags & TCP_FLAG_RESET) 1464 return DROP; 1465 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1466 return DROP | RESET; 1467 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1468 return DROP; 1469 1470 // TODO: drop broadcast/multicast 1471 1472 // spawn new endpoint for accept() 1473 net_socket* newSocket; 1474 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) { 1475 T(Error(this, "spawning failed", __LINE__)); 1476 return DROP; 1477 } 1478 1479 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this, 1480 segment, buffer); 1481 } 1482 1483 1484 int32 1485 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 1486 net_buffer *buffer) 1487 { 1488 TRACE("_SynchronizeSentReceive()"); 1489 1490 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1491 && (fInitialSendSequence >= segment.acknowledge 1492 || fSendMax < segment.acknowledge)) 1493 return DROP | RESET; 1494 1495 if (segment.flags & TCP_FLAG_RESET) { 1496 _HandleReset(ECONNREFUSED); 1497 return DROP; 1498 } 1499 1500 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1501 return DROP; 1502 1503 fSendUnacknowledged = segment.acknowledge; 1504 _PrepareReceivePath(segment); 1505 1506 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 1507 _MarkEstablished(); 1508 } else { 1509 // simultaneous open 1510 fState = SYNCHRONIZE_RECEIVED; 1511 T(State(this)); 1512 } 1513 1514 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1515 // we handled this flag now, it must not be set for further processing 1516 1517 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 1518 } 1519 1520 1521 int32 1522 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer) 1523 { 1524 uint32 advertisedWindow = (uint32)segment.advertised_window 1525 << fSendWindowShift; 1526 size_t segmentLength = buffer->size; 1527 1528 // First, handle the most common case for uni-directional data transfer 1529 // (known as header prediction - the segment must not change the window, 1530 // and must be the expected sequence, and contain no control flags) 1531 1532 if (fState == ESTABLISHED 1533 && segment.AcknowledgeOnly() 1534 && fReceiveNext == segment.sequence 1535 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1536 && fSendNext == fSendMax) { 1537 _UpdateTimestamps(segment, segmentLength); 1538 1539 if (segmentLength == 0) { 1540 // this is a pure acknowledge segment - we're on the sending end 1541 if (fSendUnacknowledged < segment.acknowledge 1542 && fSendMax >= segment.acknowledge) { 1543 _Acknowledged(segment); 1544 return DROP; 1545 } 1546 } else if (segment.acknowledge == fSendUnacknowledged 1547 && fReceiveQueue.IsContiguous() 1548 && fReceiveQueue.Free() >= segmentLength 1549 && (fFlags & FLAG_NO_RECEIVE) == 0) { 1550 if (_AddData(segment, buffer)) 1551 _NotifyReader(); 1552 1553 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0 1554 ? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1555 } 1556 } 1557 1558 // The fast path was not applicable, so we continue with the standard 1559 // processing of the incoming segment 1560 1561 ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN); 1562 1563 if (fState != CLOSED && fState != TIME_WAIT) { 1564 // Check sequence number 1565 if (!segment_in_sequence(segment, segmentLength, fReceiveNext, 1566 fReceiveWindow)) { 1567 TRACE(" Receive(): segment out of window, next: %" B_PRIu32 1568 " wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow); 1569 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1570 // TODO: this doesn't look right - review! 1571 return DROP; 1572 } 1573 return DROP | IMMEDIATE_ACKNOWLEDGE; 1574 } 1575 } 1576 1577 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1578 // Is this a valid reset? 1579 // We generally ignore resets in time wait state (see RFC 1337) 1580 if (fLastAcknowledgeSent <= segment.sequence 1581 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent 1582 + fReceiveWindow) 1583 && fState != TIME_WAIT) { 1584 status_t error; 1585 if (fState == SYNCHRONIZE_RECEIVED) 1586 error = ECONNREFUSED; 1587 else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1588 error = ENOTCONN; 1589 else 1590 error = ECONNRESET; 1591 1592 _HandleReset(error); 1593 } 1594 1595 return DROP; 1596 } 1597 1598 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1599 || (fState == SYNCHRONIZE_RECEIVED 1600 && (fInitialReceiveSequence > segment.sequence 1601 || ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1602 && (fSendUnacknowledged > segment.acknowledge 1603 || fSendMax < segment.acknowledge))))) { 1604 // reset the connection - either the initial SYN was faulty, or we 1605 // received a SYN within the data stream 1606 return DROP | RESET; 1607 } 1608 1609 // TODO: Check this! Why do we advertize a window outside of what we should 1610 // buffer? 1611 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1612 // the window must not shrink 1613 1614 // trim buffer to be within the receive window 1615 int32 drop = (int32)(fReceiveNext - segment.sequence).Number(); 1616 if (drop > 0) { 1617 if ((uint32)drop > buffer->size 1618 || ((uint32)drop == buffer->size 1619 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1620 // don't accidently remove a FIN we shouldn't remove 1621 segment.flags &= ~TCP_FLAG_FINISH; 1622 drop = buffer->size; 1623 } 1624 1625 // remove duplicate data at the start 1626 TRACE("* remove %" B_PRId32 " bytes from the start", drop); 1627 gBufferModule->remove_header(buffer, drop); 1628 segment.sequence += drop; 1629 } 1630 1631 int32 action = KEEP; 1632 1633 drop = (int32)(segment.sequence + buffer->size 1634 - (fReceiveNext + fReceiveWindow)).Number(); 1635 if (drop > 0) { 1636 // remove data exceeding our window 1637 if ((uint32)drop >= buffer->size) { 1638 // if we can accept data, or the segment is not what we'd expect, 1639 // drop the segment (an immediate acknowledge is always triggered) 1640 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1641 return DROP | IMMEDIATE_ACKNOWLEDGE; 1642 1643 action |= IMMEDIATE_ACKNOWLEDGE; 1644 } 1645 1646 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1647 // we need to remove the finish, too, as part of the data 1648 drop--; 1649 } 1650 1651 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1652 TRACE("* remove %" B_PRId32 " bytes from the end", drop); 1653 gBufferModule->remove_trailer(buffer, drop); 1654 } 1655 1656 #ifdef TRACE_TCP 1657 if (advertisedWindow > fSendWindow) { 1658 TRACE(" Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32, 1659 fSendWindow, advertisedWindow); 1660 } 1661 #endif 1662 1663 fSendWindow = advertisedWindow; 1664 if (advertisedWindow > fSendMaxWindow) 1665 fSendMaxWindow = advertisedWindow; 1666 1667 // Then look at the acknowledgement for any updates 1668 1669 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1670 // process acknowledged data 1671 if (fState == SYNCHRONIZE_RECEIVED) 1672 _MarkEstablished(); 1673 1674 if (fSendMax < segment.acknowledge) 1675 return DROP | IMMEDIATE_ACKNOWLEDGE; 1676 1677 if (segment.acknowledge < fSendUnacknowledged) { 1678 if (buffer->size == 0 && advertisedWindow == fSendWindow 1679 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1680 TRACE("Receive(): duplicate ack!"); 1681 1682 _DuplicateAcknowledge(segment); 1683 } 1684 1685 return DROP; 1686 } else { 1687 // this segment acknowledges in flight data 1688 1689 if (fDuplicateAcknowledgeCount >= 3) { 1690 // deflate the window. 1691 fCongestionWindow = fSlowStartThreshold; 1692 } 1693 1694 fDuplicateAcknowledgeCount = 0; 1695 1696 if (fSendMax == segment.acknowledge) 1697 TRACE("Receive(): all inflight data ack'd!"); 1698 1699 if (segment.acknowledge > fSendQueue.LastSequence() 1700 && fState > ESTABLISHED) { 1701 TRACE("Receive(): FIN has been acknowledged!"); 1702 1703 switch (fState) { 1704 case FINISH_SENT: 1705 fState = FINISH_ACKNOWLEDGED; 1706 T(State(this)); 1707 break; 1708 case CLOSING: 1709 fState = TIME_WAIT; 1710 T(State(this)); 1711 _EnterTimeWait(); 1712 return DROP; 1713 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1714 _Close(); 1715 break; 1716 1717 default: 1718 break; 1719 } 1720 } 1721 1722 if (fState != CLOSED) 1723 _Acknowledged(segment); 1724 } 1725 } 1726 1727 if (segment.flags & TCP_FLAG_URGENT) { 1728 if (fState == ESTABLISHED || fState == FINISH_SENT 1729 || fState == FINISH_ACKNOWLEDGED) { 1730 // TODO: Handle urgent data: 1731 // - RCV.UP <- max(RCV.UP, SEG.UP) 1732 // - signal the user that urgent data is available (SIGURG) 1733 } 1734 } 1735 1736 bool notify = false; 1737 1738 // The buffer may be freed if its data is added to the queue, so cache 1739 // the size as we still need it later. 1740 uint32 bufferSize = buffer->size; 1741 1742 if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0) 1743 && _ShouldReceive()) 1744 notify = _AddData(segment, buffer); 1745 else { 1746 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1747 fReceiveNext += buffer->size; 1748 1749 action = (action & ~KEEP) | DROP; 1750 } 1751 1752 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1753 segmentLength++; 1754 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1755 TRACE("Receive(): peer is finishing connection!"); 1756 fReceiveNext++; 1757 notify = true; 1758 1759 // FIN implies PUSH 1760 fReceiveQueue.SetPushPointer(); 1761 1762 // we'll reply immediately to the FIN if we are not 1763 // transitioning to TIME WAIT so we immediatly ACK it. 1764 action |= IMMEDIATE_ACKNOWLEDGE; 1765 1766 // other side is closing connection; change states 1767 switch (fState) { 1768 case ESTABLISHED: 1769 case SYNCHRONIZE_RECEIVED: 1770 fState = FINISH_RECEIVED; 1771 T(State(this)); 1772 break; 1773 case FINISH_SENT: 1774 // simultaneous close 1775 fState = CLOSING; 1776 T(State(this)); 1777 break; 1778 case FINISH_ACKNOWLEDGED: 1779 fState = TIME_WAIT; 1780 T(State(this)); 1781 _EnterTimeWait(); 1782 break; 1783 case TIME_WAIT: 1784 _UpdateTimeWait(); 1785 break; 1786 1787 default: 1788 break; 1789 } 1790 } 1791 } 1792 1793 if (notify) 1794 _NotifyReader(); 1795 1796 if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1797 action |= ACKNOWLEDGE; 1798 1799 _UpdateTimestamps(segment, segmentLength); 1800 1801 TRACE("Receive() Action %" B_PRId32, action); 1802 1803 return action; 1804 } 1805 1806 1807 int32 1808 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer) 1809 { 1810 MutexLocker locker(fLock); 1811 1812 TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s " 1813 "to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 1814 ", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source), 1815 PrintAddress(buffer->destination), segment.flags, segment.sequence, 1816 segment.acknowledge, 1817 (uint32)segment.advertised_window << fSendWindowShift); 1818 T(Receive(this, segment, 1819 (uint32)segment.advertised_window << fSendWindowShift, buffer)); 1820 int32 segmentAction = DROP; 1821 1822 switch (fState) { 1823 case LISTEN: 1824 segmentAction = _ListenReceive(segment, buffer); 1825 break; 1826 1827 case SYNCHRONIZE_SENT: 1828 segmentAction = _SynchronizeSentReceive(segment, buffer); 1829 break; 1830 1831 case SYNCHRONIZE_RECEIVED: 1832 case ESTABLISHED: 1833 case FINISH_RECEIVED: 1834 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1835 case FINISH_SENT: 1836 case FINISH_ACKNOWLEDGED: 1837 case CLOSING: 1838 case TIME_WAIT: 1839 case CLOSED: 1840 segmentAction = _Receive(segment, buffer); 1841 break; 1842 } 1843 1844 // process acknowledge action as asked for by the *Receive() method 1845 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 1846 SendAcknowledge(true); 1847 else if (segmentAction & ACKNOWLEDGE) 1848 DelayedAcknowledge(); 1849 1850 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) 1851 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) { 1852 locker.Unlock(); 1853 gSocketModule->release_socket(socket); 1854 } 1855 1856 return segmentAction; 1857 } 1858 1859 1860 // #pragma mark - send 1861 1862 1863 inline uint8 1864 TCPEndpoint::_CurrentFlags() 1865 { 1866 // we don't set FLAG_FINISH here, instead we do it 1867 // conditionally below depending if we are sending 1868 // the last bytes of the send queue. 1869 1870 switch (fState) { 1871 case CLOSED: 1872 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1873 1874 case SYNCHRONIZE_SENT: 1875 return TCP_FLAG_SYNCHRONIZE; 1876 case SYNCHRONIZE_RECEIVED: 1877 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1878 1879 case ESTABLISHED: 1880 case FINISH_RECEIVED: 1881 case FINISH_ACKNOWLEDGED: 1882 case TIME_WAIT: 1883 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1884 case FINISH_SENT: 1885 case CLOSING: 1886 return TCP_FLAG_ACKNOWLEDGE; 1887 1888 default: 1889 return 0; 1890 } 1891 } 1892 1893 1894 inline bool 1895 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length, 1896 uint32 segmentMaxSize, uint32 flightSize) 1897 { 1898 if (length > 0) { 1899 // Avoid the silly window syndrome - we only send a segment in case: 1900 // - we have a full segment to send, or 1901 // - we're at the end of our buffer queue, or 1902 // - the buffer is at least larger than half of the maximum send window, 1903 // or 1904 // - we're retransmitting data 1905 if (length == segmentMaxSize 1906 || (fOptions & TCP_NODELAY) != 0 1907 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1908 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1909 return true; 1910 } 1911 1912 // check if we need to send a window update to the peer 1913 if (segment.advertised_window > 0) { 1914 // correct the window to take into account what already has been advertised 1915 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1916 - (fReceiveMaxAdvertised - fReceiveNext).Number(); 1917 1918 // if we can advertise a window larger than twice the maximum segment 1919 // size, or half the maximum buffer size we send a window update 1920 if (window >= (fReceiveMaxSegmentSize << 1) 1921 || window >= (socket->receive.buffer_size >> 1)) 1922 return true; 1923 } 1924 1925 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 1926 | TCP_FLAG_RESET)) != 0) 1927 return true; 1928 1929 // We do have urgent data pending 1930 if (fSendUrgentOffset > fSendNext) 1931 return true; 1932 1933 // there is no reason to send a segment just now 1934 return false; 1935 } 1936 1937 1938 status_t 1939 TCPEndpoint::_SendQueued(bool force) 1940 { 1941 return _SendQueued(force, fSendWindow); 1942 } 1943 1944 1945 /*! Sends one or more TCP segments with the data waiting in the queue, or some 1946 specific flags that need to be sent. 1947 */ 1948 status_t 1949 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1950 { 1951 if (fRoute == NULL) 1952 return B_ERROR; 1953 1954 // in passive state? 1955 if (fState == LISTEN) 1956 return B_ERROR; 1957 1958 tcp_segment_header segment(_CurrentFlags()); 1959 1960 if ((fOptions & TCP_NOOPT) == 0) { 1961 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 1962 segment.options |= TCP_HAS_TIMESTAMPS; 1963 segment.timestamp_reply = fReceivedTimestamp; 1964 segment.timestamp_value = tcp_now(); 1965 } 1966 1967 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1968 && fSendNext == fInitialSendSequence) { 1969 // add connection establishment options 1970 segment.max_segment_size = fReceiveMaxSegmentSize; 1971 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1972 segment.options |= TCP_HAS_WINDOW_SCALE; 1973 segment.window_shift = fReceiveWindowShift; 1974 } 1975 } 1976 } 1977 1978 size_t availableBytes = fReceiveQueue.Free(); 1979 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1980 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1981 else 1982 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1983 1984 segment.acknowledge = fReceiveNext.Number(); 1985 1986 // Process urgent data 1987 if (fSendUrgentOffset > fSendNext) { 1988 segment.flags |= TCP_FLAG_URGENT; 1989 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number(); 1990 } else { 1991 fSendUrgentOffset = fSendUnacknowledged.Number(); 1992 // Keep urgent offset updated, so that it doesn't reach into our 1993 // send window on overlap 1994 segment.urgent_offset = 0; 1995 } 1996 1997 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow) 1998 sendWindow = fCongestionWindow; 1999 2000 // fSendUnacknowledged 2001 // | fSendNext fSendMax 2002 // | | | 2003 // v v v 2004 // ----------------------------------- 2005 // | effective window | 2006 // ----------------------------------- 2007 2008 // Flight size represents the window of data which is currently in the 2009 // ether. We should never send data such as the flight size becomes larger 2010 // than the effective window. Note however that the effective window may be 2011 // reduced (by congestion for instance), so at some point in time flight 2012 // size may be larger than the currently calculated window. 2013 2014 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 2015 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number(); 2016 2017 if (consumedWindow > sendWindow) { 2018 sendWindow = 0; 2019 // TODO: enter persist state? try to get a window update. 2020 } else 2021 sendWindow -= consumedWindow; 2022 2023 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 2024 // send one byte of data to ask for a window update 2025 // (triggered by the persist timer) 2026 sendWindow = 1; 2027 } 2028 2029 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 2030 bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged; 2031 bool retransmit = fSendNext < fSendMax; 2032 2033 do { 2034 uint32 segmentMaxSize = fSendMaxSegmentSize 2035 - tcp_options_length(segment); 2036 uint32 segmentLength = min_c(length, segmentMaxSize); 2037 2038 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 2039 if (state_needs_finish(fState)) 2040 segment.flags |= TCP_FLAG_FINISH; 2041 if (length > 0) 2042 segment.flags |= TCP_FLAG_PUSH; 2043 } 2044 2045 // Determine if we should really send this segment 2046 if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength, 2047 segmentMaxSize, flightSize)) { 2048 if (fSendQueue.Available() 2049 && !gStackModule->is_timer_active(&fPersistTimer) 2050 && !gStackModule->is_timer_active(&fRetransmitTimer)) 2051 _StartPersistTimer(); 2052 break; 2053 } 2054 2055 net_buffer *buffer = gBufferModule->create(256); 2056 if (buffer == NULL) 2057 return B_NO_MEMORY; 2058 2059 status_t status = B_OK; 2060 if (segmentLength > 0) 2061 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 2062 if (status < B_OK) { 2063 gBufferModule->free(buffer); 2064 return status; 2065 } 2066 2067 LocalAddress().CopyTo(buffer->source); 2068 PeerAddress().CopyTo(buffer->destination); 2069 2070 uint32 size = buffer->size; 2071 segment.sequence = fSendNext.Number(); 2072 2073 TRACE("SendQueued(): buffer %p (%" B_PRIu32 " bytes) address %s to " 2074 "%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 2075 ", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32 2076 ", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32, 2077 buffer, buffer->size, PrintAddress(buffer->source), 2078 PrintAddress(buffer->destination), segment.flags, segment.sequence, 2079 segment.acknowledge, segment.advertised_window, 2080 fCongestionWindow, fSlowStartThreshold, segmentLength, 2081 fSendQueue.FirstSequence().Number(), 2082 fSendQueue.LastSequence().Number()); 2083 T(Send(this, segment, buffer, fSendQueue.FirstSequence(), 2084 fSendQueue.LastSequence())); 2085 2086 PROBE(buffer, sendWindow); 2087 sendWindow -= buffer->size; 2088 2089 status = add_tcp_header(AddressModule(), segment, buffer); 2090 if (status != B_OK) { 2091 gBufferModule->free(buffer); 2092 return status; 2093 } 2094 2095 // Update send status - we need to do this before we send the data 2096 // for local connections as the answer is directly handled 2097 2098 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 2099 segment.options &= ~TCP_HAS_WINDOW_SCALE; 2100 segment.max_segment_size = 0; 2101 size++; 2102 } 2103 2104 if (segment.flags & TCP_FLAG_FINISH) 2105 size++; 2106 2107 uint32 sendMax = fSendMax.Number(); 2108 fSendNext += size; 2109 if (fSendMax < fSendNext) 2110 fSendMax = fSendNext; 2111 2112 fReceiveMaxAdvertised = fReceiveNext 2113 + ((uint32)segment.advertised_window << fReceiveWindowShift); 2114 2115 status = next->module->send_routed_data(next, fRoute, buffer); 2116 if (status < B_OK) { 2117 gBufferModule->free(buffer); 2118 2119 fSendNext = segment.sequence; 2120 fSendMax = sendMax; 2121 // restore send status 2122 return status; 2123 } 2124 2125 if (shouldStartRetransmitTimer && size > 0) { 2126 TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME, 2127 fRetransmitTimeout); 2128 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2129 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2130 shouldStartRetransmitTimer = false; 2131 } 2132 2133 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 2134 fLastAcknowledgeSent = segment.acknowledge; 2135 2136 length -= segmentLength; 2137 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET 2138 | TCP_FLAG_FINISH); 2139 2140 if (retransmit) 2141 break; 2142 2143 } while (length > 0); 2144 2145 return B_OK; 2146 } 2147 2148 2149 int 2150 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const 2151 { 2152 return next->module->get_mtu(next, address) - sizeof(tcp_header); 2153 } 2154 2155 2156 status_t 2157 TCPEndpoint::_PrepareSendPath(const sockaddr* peer) 2158 { 2159 if (fRoute == NULL) { 2160 fRoute = gDatalinkModule->get_route(Domain(), peer); 2161 if (fRoute == NULL) 2162 return ENETUNREACH; 2163 2164 if ((fRoute->flags & RTF_LOCAL) != 0) 2165 fFlags |= FLAG_LOCAL; 2166 } 2167 2168 // make sure connection does not already exist 2169 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 2170 fRoute->interface_address->local); 2171 if (status < B_OK) 2172 return status; 2173 2174 fInitialSendSequence = system_time() >> 4; 2175 fSendNext = fInitialSendSequence; 2176 fSendUnacknowledged = fInitialSendSequence; 2177 fSendMax = fInitialSendSequence; 2178 fSendUrgentOffset = fInitialSendSequence; 2179 2180 // we are counting the SYN here 2181 fSendQueue.SetInitialSequence(fSendNext + 1); 2182 2183 fReceiveMaxSegmentSize = _MaxSegmentSize(peer); 2184 2185 // Compute the window shift we advertise to our peer - if it doesn't support 2186 // this option, this will be reset to 0 (when its SYN is received) 2187 fReceiveWindowShift = 0; 2188 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 2189 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 2190 fReceiveWindowShift++; 2191 } 2192 2193 return B_OK; 2194 } 2195 2196 2197 void 2198 TCPEndpoint::_Acknowledged(tcp_segment_header& segment) 2199 { 2200 TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %" 2201 B_PRIu32 "; max %" B_PRIu32, segment.acknowledge, 2202 fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number()); 2203 2204 ASSERT(fSendUnacknowledged <= segment.acknowledge); 2205 2206 if (fSendUnacknowledged < segment.acknowledge) { 2207 fSendQueue.RemoveUntil(segment.acknowledge); 2208 fSendUnacknowledged = segment.acknowledge; 2209 if (fSendNext < fSendUnacknowledged) 2210 fSendNext = fSendUnacknowledged; 2211 2212 if (segment.options & TCP_HAS_TIMESTAMPS) 2213 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply)); 2214 else { 2215 // TODO: Fallback to RFC 793 type estimation; This just resets 2216 // any potential exponential back off that happened due to 2217 // retransmits. 2218 fRetransmitTimeout = TCP_INITIAL_RTT; 2219 } 2220 2221 if (fSendUnacknowledged == fSendMax) { 2222 TRACE("all acknowledged, cancelling retransmission timer"); 2223 gStackModule->cancel_timer(&fRetransmitTimer); 2224 T(TimerSet(this, "retransmit", -1)); 2225 } else { 2226 TRACE("data acknowledged, resetting retransmission timer to: %" 2227 B_PRIdBIGTIME, fRetransmitTimeout); 2228 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2229 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2230 } 2231 2232 if (is_writable(fState)) { 2233 // notify threads waiting on the socket to become writable again 2234 fSendCondition.NotifyAll(); 2235 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 2236 } 2237 2238 if (fCongestionWindow < fSlowStartThreshold) 2239 fCongestionWindow += fSendMaxSegmentSize; 2240 } 2241 2242 if (fCongestionWindow >= fSlowStartThreshold) { 2243 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 2244 2245 if (increment < fCongestionWindow) 2246 increment = 1; 2247 else 2248 increment /= fCongestionWindow; 2249 2250 fCongestionWindow += increment; 2251 } 2252 2253 // if there is data left to be sent, send it now 2254 if (fSendQueue.Used() > 0) 2255 _SendQueued(); 2256 } 2257 2258 2259 void 2260 TCPEndpoint::_Retransmit() 2261 { 2262 TRACE("Retransmit()"); 2263 2264 _ResetSlowStart(); 2265 fSendNext = fSendUnacknowledged; 2266 2267 // Do exponential back off of the retransmit timeout 2268 fRetransmitTimeout *= 2; 2269 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT) 2270 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT; 2271 2272 _SendQueued(); 2273 } 2274 2275 2276 void 2277 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime) 2278 { 2279 int32 rtt = roundTripTime; 2280 2281 // "smooth" round trip time as per Van Jacobson 2282 rtt -= fRoundTripTime / 8; 2283 fRoundTripTime += rtt; 2284 if (rtt < 0) 2285 rtt = -rtt; 2286 rtt -= fRoundTripDeviation / 4; 2287 fRoundTripDeviation += rtt; 2288 2289 fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2) 2290 * kTimestampFactor; 2291 if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT) 2292 fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT; 2293 2294 TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)", 2295 fRetransmitTimeout, roundTripTime); 2296 } 2297 2298 2299 void 2300 TCPEndpoint::_ResetSlowStart() 2301 { 2302 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2, 2303 2 * fSendMaxSegmentSize); 2304 fCongestionWindow = fSendMaxSegmentSize; 2305 } 2306 2307 2308 // #pragma mark - timer 2309 2310 2311 /*static*/ void 2312 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint) 2313 { 2314 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2315 T(TimerTriggered(endpoint, "retransmit")); 2316 2317 MutexLocker locker(endpoint->fLock); 2318 if (!locker.IsLocked()) 2319 return; 2320 2321 endpoint->_Retransmit(); 2322 } 2323 2324 2325 /*static*/ void 2326 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint) 2327 { 2328 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2329 T(TimerTriggered(endpoint, "persist")); 2330 2331 MutexLocker locker(endpoint->fLock); 2332 if (!locker.IsLocked()) 2333 return; 2334 2335 // the timer might not have been canceled early enough 2336 if (endpoint->State() == CLOSED) 2337 return; 2338 2339 endpoint->_SendQueued(true); 2340 } 2341 2342 2343 /*static*/ void 2344 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint) 2345 { 2346 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2347 T(TimerTriggered(endpoint, "delayed ack")); 2348 2349 MutexLocker locker(endpoint->fLock); 2350 if (!locker.IsLocked()) 2351 return; 2352 2353 // the timer might not have been canceled early enough 2354 if (endpoint->State() == CLOSED) 2355 return; 2356 2357 endpoint->SendAcknowledge(true); 2358 } 2359 2360 2361 /*static*/ void 2362 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint) 2363 { 2364 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2365 T(TimerTriggered(endpoint, "time-wait")); 2366 2367 MutexLocker locker(endpoint->fLock); 2368 if (!locker.IsLocked()) 2369 return; 2370 2371 if ((endpoint->fFlags & FLAG_CLOSED) == 0) { 2372 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE; 2373 return; 2374 } 2375 2376 locker.Unlock(); 2377 2378 gSocketModule->release_socket(endpoint->socket); 2379 } 2380 2381 2382 /*static*/ status_t 2383 TCPEndpoint::_WaitForCondition(ConditionVariable& condition, 2384 MutexLocker& locker, bigtime_t timeout) 2385 { 2386 ConditionVariableEntry entry; 2387 condition.Add(&entry); 2388 2389 locker.Unlock(); 2390 status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout); 2391 locker.Lock(); 2392 2393 return result; 2394 } 2395 2396 2397 // #pragma mark - 2398 2399 2400 void 2401 TCPEndpoint::Dump() const 2402 { 2403 kprintf("TCP endpoint %p\n", this); 2404 kprintf(" state: %s\n", name_for_state(fState)); 2405 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags); 2406 #if KDEBUG 2407 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder); 2408 #endif 2409 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore); 2410 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions); 2411 kprintf(" send\n"); 2412 kprintf(" window shift: %" B_PRIu8 "\n", fSendWindowShift); 2413 kprintf(" unacknowledged: %" B_PRIu32 "\n", 2414 fSendUnacknowledged.Number()); 2415 kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number()); 2416 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number()); 2417 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number()); 2418 kprintf(" window: %" B_PRIu32 "\n", fSendWindow); 2419 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow); 2420 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize); 2421 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(), 2422 fSendQueue.Size()); 2423 #if DEBUG_BUFFER_QUEUE 2424 fSendQueue.Dump(); 2425 #endif 2426 kprintf(" last acknowledge sent: %" B_PRIu32 "\n", 2427 fLastAcknowledgeSent.Number()); 2428 kprintf(" initial sequence: %" B_PRIu32 "\n", 2429 fInitialSendSequence.Number()); 2430 kprintf(" receive\n"); 2431 kprintf(" window shift: %" B_PRIu8 "\n", fReceiveWindowShift); 2432 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number()); 2433 kprintf(" max advertised: %" B_PRIu32 "\n", 2434 fReceiveMaxAdvertised.Number()); 2435 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow); 2436 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize); 2437 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", 2438 fReceiveQueue.Available(), fReceiveQueue.Size()); 2439 #if DEBUG_BUFFER_QUEUE 2440 fReceiveQueue.Dump(); 2441 #endif 2442 kprintf(" initial sequence: %" B_PRIu32 "\n", 2443 fInitialReceiveSequence.Number()); 2444 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n", 2445 fDuplicateAcknowledgeCount); 2446 kprintf(" round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n", 2447 fRoundTripTime, fRoundTripDeviation); 2448 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout); 2449 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow); 2450 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold); 2451 } 2452 2453